diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index dde84e2090b90..2f8777fffdc92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -405,8 +405,6 @@ class AMDGPUTargetLowering : public TargetLowering { // are using vector compares until that is fixed. return true; } - - bool softPromoteHalfType() const override { return false; } }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 33a23ffb81926..5cd7a61d2c936 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1480,6 +1480,9 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } + if (VT.isInteger() && !MemVT.isInteger()) + MemVT = MemVT.changeTypeToInteger(); + if (AMDGPU::isShader(CallConv)) { Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 2ce67c3848bae..a62a1828a6e93 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -30728,6 +30728,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -30744,650 +30746,219 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -31404,12 +30975,44 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31585,197 +31188,128 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-LABEL: bitcast_v32i32_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 ; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 ; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 ; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 ; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_writelane_b32 v32, s51, 11 ; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_writelane_b32 v32, s52, 12 ; SI-NEXT: v_readfirstlane_b32 s45, v19 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 ; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v1 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: v_readfirstlane_b32 s19, v4 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: v_readfirstlane_b32 s16, v7 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s10, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 ; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s56, s56, 3 ; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -31790,335 +31324,233 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s56, s42, 16 -; SI-NEXT: s_lshr_b32 s57, s43, 16 -; SI-NEXT: s_lshr_b32 s58, s44, 16 -; SI-NEXT: s_lshr_b32 s59, s45, 16 -; SI-NEXT: s_lshr_b32 s60, s46, 16 -; SI-NEXT: s_lshr_b32 s61, s47, 16 -; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s25, 16 -; SI-NEXT: s_lshr_b32 s72, s26, 16 -; SI-NEXT: s_lshr_b32 s73, s27, 16 -; SI-NEXT: s_lshr_b32 s74, s28, 16 -; SI-NEXT: s_lshr_b32 s75, s29, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s22, 16 -; SI-NEXT: s_lshr_b32 s78, s21, 16 -; SI-NEXT: s_lshr_b32 s79, s20, 16 -; SI-NEXT: s_lshr_b32 s88, s19, 16 -; SI-NEXT: s_lshr_b32 s89, s18, 16 -; SI-NEXT: s_lshr_b32 s90, s17, 16 -; SI-NEXT: s_lshr_b32 s91, s16, 16 -; SI-NEXT: s_lshr_b32 s92, s15, 16 -; SI-NEXT: s_lshr_b32 s93, s14, 16 -; SI-NEXT: s_lshr_b32 s94, s13, 16 -; SI-NEXT: s_lshr_b32 s95, s12, 16 -; SI-NEXT: s_lshr_b32 vcc_lo, s11, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s10, 16 -; SI-NEXT: s_lshr_b32 s30, s8, 16 -; SI-NEXT: s_lshr_b32 s31, s7, 16 -; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v10, v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v12, v30, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v16, v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v18, v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_or_b32_e32 v20, v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v15, v40, v15 -; SI-NEXT: v_or_b32_e32 v24, v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v26, v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v17, v54, v17 -; SI-NEXT: v_or_b32_e32 v19, v52, v19 -; SI-NEXT: v_or_b32_e32 v21, v50, v21 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s56, s60, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v32i32_to_v64f16_scalar: @@ -32381,342 +31813,208 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -32728,429 +32026,474 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB22_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -33413,8 +32756,6 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -33431,696 +32772,396 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB23_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB23_3 -; SI-NEXT: .LBB23_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB23_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB23_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -34133,33 +33174,25 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -34167,14 +33200,21 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -34191,8 +33231,27 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v64f16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -67607,6 +66666,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -67623,650 +66684,219 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v53, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v42 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -68283,12 +66913,44 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -68449,6 +67111,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -68465,542 +67142,259 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, s16 -; SI-NEXT: v_mov_b32_e32 v40, s17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v41, s18 -; SI-NEXT: v_mov_b32_e32 v55, s19 -; SI-NEXT: v_mov_b32_e32 v54, s20 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s22 -; SI-NEXT: v_mov_b32_e32 v51, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v39, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v37, s29 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[32:33], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v55 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v56 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 +; SI-NEXT: v_mov_b32_e32 v10, v54 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v13, v41 +; SI-NEXT: v_mov_b32_e32 v14, v32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -69017,112 +67411,55 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v17, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -69369,342 +67706,208 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -69716,429 +67919,474 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -70401,8 +68649,6 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-LABEL: bitcast_v64f16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -70419,696 +68665,396 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB47_3 -; SI-NEXT: .LBB47_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB47_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB47_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -71121,33 +69067,25 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -71155,14 +69093,21 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -71179,8 +69124,27 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v64f16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -102726,6 +100690,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -102742,290 +100708,91 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 @@ -103044,349 +100811,116 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v54, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v41, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v44, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v47, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_mov_b32_e32 v38, v28 -; SI-NEXT: v_mov_b32_e32 v36, v30 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v31, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v61 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v63 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v38 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -103403,12 +100937,44 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -103592,540 +101158,369 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-LABEL: bitcast_v16i64_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 ; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s56, v19 ; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_readfirstlane_b32 s44, v19 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_readfirstlane_b32 s57, v19 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_readfirstlane_b32 s46, v19 ; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s47, v19 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s46, v19 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s45, v19 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: v_mov_b32_e32 v19, s23 -; SI-NEXT: v_readfirstlane_b32 s47, v19 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_readfirstlane_b32 s41, v19 ; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_readfirstlane_b32 s25, v19 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: v_readfirstlane_b32 s23, v1 -; SI-NEXT: v_readfirstlane_b32 s20, v2 -; SI-NEXT: v_readfirstlane_b32 s21, v3 -; SI-NEXT: v_readfirstlane_b32 s18, v4 -; SI-NEXT: v_readfirstlane_b32 s19, v5 -; SI-NEXT: v_readfirstlane_b32 s16, v6 -; SI-NEXT: v_readfirstlane_b32 s17, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v0 +; SI-NEXT: v_readfirstlane_b32 s21, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v2 +; SI-NEXT: v_readfirstlane_b32 s19, v3 +; SI-NEXT: v_readfirstlane_b32 s16, v4 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_writelane_b32 v32, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s56, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s47, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s40 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s44, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s44, s45, 0 -; SI-NEXT: s_lshr_b32 s45, s41, 16 -; SI-NEXT: s_lshr_b32 s57, s44, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s47, s47, 0 -; SI-NEXT: s_lshr_b32 s60, s43, 16 -; SI-NEXT: s_lshr_b32 s61, s47, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s56, s56, 0 -; SI-NEXT: s_lshr_b32 s62, s24, 16 -; SI-NEXT: s_lshr_b32 s63, s56, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s74, s26, 16 -; SI-NEXT: s_lshr_b32 s75, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s76, s22, 16 -; SI-NEXT: s_lshr_b32 s77, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s78, s20, 16 -; SI-NEXT: s_lshr_b32 s79, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s88, s18, 16 -; SI-NEXT: s_lshr_b32 s89, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s90, s16, 16 -; SI-NEXT: s_lshr_b32 s91, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s92, s14, 16 -; SI-NEXT: s_lshr_b32 s93, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s94, s12, 16 -; SI-NEXT: s_lshr_b32 s95, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 vcc_lo, s10, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s30, s7, 16 -; SI-NEXT: s_lshr_b32 s31, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s35, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s35 -; SI-NEXT: s_lshr_b32 s34, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s47 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s30 -; SI-NEXT: v_cvt_f32_f16_e32 v26, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v27, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v24, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_add_u32 s56, s56, 3 +; SI-NEXT: s_addc_u32 s57, s57, 0 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s66, s43, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s57, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v62 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v59 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v10, v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v12, v30, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v16, v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v18, v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_or_b32_e32 v20, v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v15, v40, v15 -; SI-NEXT: v_or_b32_e32 v24, v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v26, v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v17, v54, v17 -; SI-NEXT: v_or_b32_e32 v19, v52, v19 -; SI-NEXT: v_or_b32_e32 v21, v50, v21 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v25, v38, v25 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_and_b32 s29, s56, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s56 +; SI-NEXT: s_lshl_b32 s56, s34, 16 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_or_b32 s46, s46, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s56, s68, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s56 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s56, s67, 16 +; SI-NEXT: s_or_b32 s45, s45, s56 +; SI-NEXT: s_lshl_b32 s56, s94, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s56 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s56, s66, 16 +; SI-NEXT: s_or_b32 s43, s43, s56 +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s56 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s56, s65, 16 +; SI-NEXT: s_or_b32 s41, s41, s56 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s90, 16 +; SI-NEXT: s_or_b32 s24, s24, s56 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s56, s64, 16 +; SI-NEXT: s_or_b32 s25, s25, s56 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s88, 16 +; SI-NEXT: s_or_b32 s22, s22, s56 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s23, s23, s56 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s56 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s54, 16 +; SI-NEXT: s_or_b32 s21, s21, s56 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s56 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s56, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s56 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s74, 16 +; SI-NEXT: s_or_b32 s16, s16, s56 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s56, s52, 16 +; SI-NEXT: s_or_b32 s17, s17, s56 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s56, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s56 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s56, s51, 16 +; SI-NEXT: s_or_b32 s15, s15, s56 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s56, s62, 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s56, s50, 16 +; SI-NEXT: s_or_b32 s13, s13, s56 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s56, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s56 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s56, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s56 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s56, s60, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s56 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s56, s48, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s39, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: s_or_b32 s9, s9, s56 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s40 +; SI-NEXT: v_mov_b32_e32 v9, s41 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB65_2 ; ; VI-LABEL: bitcast_v16i64_to_v64f16_scalar: @@ -104396,342 +101791,208 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -104743,429 +102004,474 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -105428,8 +102734,6 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -105446,696 +102750,396 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB67_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB67_3 -; SI-NEXT: .LBB67_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB67_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB67_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -106148,33 +103152,25 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -106182,14 +103178,21 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -106206,8 +103209,27 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v64f16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -136955,532 +133977,202 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 -; SI-NEXT: v_mov_b32_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_add_f64 v[32:33], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_alignbit_b32 v32, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v33, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v34, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v35, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v36, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v37, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v52, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v40, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v43, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v46, v3, v2, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v57, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v43 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -137497,91 +134189,47 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v34 +; SI-NEXT: v_or_b32_e32 v29, v29, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64f16: @@ -137709,21 +134357,21 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 ; SI-NEXT: v_mov_b32_e32 v26, s20 ; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v30, s22 -; SI-NEXT: v_mov_b32_e32 v31, s23 -; SI-NEXT: v_mov_b32_e32 v28, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v24, s26 -; SI-NEXT: v_mov_b32_e32 v25, s27 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s24 +; SI-NEXT: v_mov_b32_e32 v23, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v22, s28 -; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -137742,520 +134390,229 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB81_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 ; SI-NEXT: s_cbranch_execnz .LBB81_3 ; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f64 v[39:40], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[0:1], v[20:21], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: v_add_f64 v[0:1], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshr_b64 v[42:43], v[16:17], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[43:44], v[14:15], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v5 -; SI-NEXT: v_mov_b32_e32 v36, v16 -; SI-NEXT: v_mov_b32_e32 v35, v17 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[32:33], v[20:21], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_lshr_b64 v[44:45], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[22:23], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_lshr_b64 v[45:46], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[56:57], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[57:58], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v31 ; SI-NEXT: .LBB81_3: ; %end -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v35 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v36, v30, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 +; SI-NEXT: v_or_b32_e32 v37, v30, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v38, v28, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v39, v28, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v48, v26, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v49, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v50, v24, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v54, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v55, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v40, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 +; SI-NEXT: v_or_b32_e32 v41, v18, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v58 +; SI-NEXT: v_or_b32_e32 v32, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v33, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v35, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_mov_b32_e32 v4, v48 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_mov_b32_e32 v8, v52 +; SI-NEXT: v_mov_b32_e32 v9, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v10, v54 +; SI-NEXT: v_mov_b32_e32 v11, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v40 +; SI-NEXT: v_mov_b32_e32 v13, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -138272,111 +134629,57 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v17, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_branch .LBB81_2 ; ; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: @@ -138593,342 +134896,208 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_mov_b32_e32 v52, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_mov_b32_e32 v42, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v58, v4 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v18, v59, v18 -; SI-NEXT: v_or_b32_e32 v19, v57, v19 -; SI-NEXT: v_or_b32_e32 v20, v47, v20 -; SI-NEXT: v_or_b32_e32 v21, v45, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v41, v23 -; SI-NEXT: v_or_b32_e32 v24, v55, v24 -; SI-NEXT: v_or_b32_e32 v25, v53, v25 -; SI-NEXT: v_or_b32_e32 v26, v51, v26 -; SI-NEXT: v_or_b32_e32 v27, v49, v27 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v36, v29 -; SI-NEXT: v_or_b32_e32 v30, v34, v30 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -138940,429 +135109,474 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v61, v17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB82_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v38 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_or_b32_e32 v31, v33, v31 @@ -139625,8 +135839,6 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-LABEL: bitcast_v64f16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -139643,696 +135855,396 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v47, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v41, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v40 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v57 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB83_2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_or_b32_e32 v14, v47, v14 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_or_b32_e32 v15, v45, v15 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v18, v40, v18 -; SI-NEXT: v_or_b32_e32 v19, v55, v19 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v20, v54, v20 -; SI-NEXT: v_or_b32_e32 v21, v53, v21 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v22, v52, v22 -; SI-NEXT: v_or_b32_e32 v23, v51, v23 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v49, v25 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_or_b32_e32 v26, v48, v26 -; SI-NEXT: v_or_b32_e32 v27, v39, v27 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v28, v38, v28 -; SI-NEXT: v_or_b32_e32 v29, v63, v29 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: v_or_b32_e32 v30, v60, v30 -; SI-NEXT: v_or_b32_e32 v31, v57, v31 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_or_b32_e32 v9, v36, v9 -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_or_b32_e32 v16, v43, v16 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: s_branch .LBB83_3 -; SI-NEXT: .LBB83_2: -; SI-NEXT: v_mov_b32_e32 v36, v34 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v35, v59 -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v63, v60 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB83_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v60, v39 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v41 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v34, v36 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB83_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v63 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v45 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -140345,33 +136257,25 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v59 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -140379,14 +136283,21 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 -; SI-NEXT: .LBB83_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -140403,8 +136314,27 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v46, v17 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v13 +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v61, v11 +; SI-NEXT: v_mov_b32_e32 v60, v10 +; SI-NEXT: v_mov_b32_e32 v59, v9 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB83_2 ; ; VI-LABEL: bitcast_v64f16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -169474,1790 +165404,1999 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v12 -; SI-NEXT: v_mov_b32_e32 v50, v0 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v16 +; SI-NEXT: v_mov_b32_e32 v61, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v60, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:184 -; SI-NEXT: v_mov_b32_e32 v44, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: v_mov_b32_e32 v60, v26 -; SI-NEXT: v_mov_b32_e32 v45, v20 -; SI-NEXT: v_mov_b32_e32 v56, v14 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v0 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v17, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v21, v0, v9 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v15, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v57 -; SI-NEXT: v_or_b32_e32 v63, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v14, v0, v58 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v16, v16, v19 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v41 -; SI-NEXT: v_and_b32_e32 v33, 0xff, v53 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v6, v6, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v46 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v47 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v13, v0, v59 -; SI-NEXT: v_mov_b32_e32 v0, v61 -; SI-NEXT: v_mov_b32_e32 v61, v3 -; SI-NEXT: v_mov_b32_e32 v3, v23 -; SI-NEXT: v_or_b32_e32 v23, v18, v62 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v18, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v51, 0xff, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v48, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v58 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v24, v18, v27 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v26, v18, v29 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v54 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v51 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v19, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v25 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v60, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v18, v1 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v1, v9 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v11, v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v7, v1 +; SI-NEXT: v_or_b32_e32 v10, v1, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v1, v12 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v10, v10, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v1, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v1, v14 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v1, v16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v4, v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v49, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v20, v1, v20 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v18, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v27, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v33, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v29, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v1, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v1, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v34, v1 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v27, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v9, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v50, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v51, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v29, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v52, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v52, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v53, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v53, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v54, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v54, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v55, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v55, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v41, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v42, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v42, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v45, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v31, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v45, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v31, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v45, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v46, v56, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v46, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v31, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v48, v48, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v47, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v47, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v56, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v56, v56, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v57, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v57, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v58, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v59, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v59, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v60, v61 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v62, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_and_b32_e32 v40, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v40, v40, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v60, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v47, v43, v47 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v61, v43, v59 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v43, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v59, v59, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v5 +; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v9, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v48, v9, v12 +; SI-NEXT: v_alignbit_b32 v9, v11, v12, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v12, v9, v17 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v12, v14, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v51, v9, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v15, v9, v16 +; SI-NEXT: v_alignbit_b32 v9, v51, v16, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v16, v13, v20 +; SI-NEXT: v_alignbit_b32 v13, v9, v20, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v31, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v13, v31, v22, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v13, v13, v52 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v13, v24, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v24, v14, v53 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v14, v28 +; SI-NEXT: v_alignbit_b32 v14, v24, v28, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v21, v14, v54 +; SI-NEXT: v_or_b32_e32 v22, v6, v30 +; SI-NEXT: v_alignbit_b32 v6, v21, v30, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v6, v55 +; SI-NEXT: v_or_b32_e32 v29, v1, v34 +; SI-NEXT: v_alignbit_b32 v1, v19, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v20, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v23, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v30, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v27, v1, v38 +; SI-NEXT: v_alignbit_b32 v1, v30, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v28, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v34, v1, v45 +; SI-NEXT: v_alignbit_b32 v1, v28, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v33, v1, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v36, v1, v7 +; SI-NEXT: v_alignbit_b32 v1, v33, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v35, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v38, v1, v46 +; SI-NEXT: v_alignbit_b32 v1, v35, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v39, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v37, v3, 16 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB92_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_mov_b32_e32 v33, v46 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 -; SI-NEXT: v_mov_b32_e32 v63, v50 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v44, v28 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v39, 0xff, v39 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 -; SI-NEXT: v_or_b32_e32 v63, v22, v63 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_or_b32_e32 v56, v58, v56 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v52, v34, v52 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x300, v11 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v15, v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v10 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v58, v14, v58 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v17, v3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v21, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v35 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v46, v19, v46 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v57, v0, v57 -; SI-NEXT: v_and_b32_e32 v41, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v41, v29, v41 -; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v45, v62, v45 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 -; SI-NEXT: v_or_b32_e32 v62, v26, v62 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_or_b32_e32 v61, v5, v61 -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v23, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v47, v59, v47 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v59, v9, v59 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v13, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v33, v2, v33 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v2, v35 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v50 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v50 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v2, v51 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v2, v53 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v2, v54 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v54 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v2, v55 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v55 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v2, v49 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v49 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v2, v48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v48 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v2, v39 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v39 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v2, v38 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v38 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v4, v2, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v60, v20, v60 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v42, v27, v42 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v42 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v37, v10, v37 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v37 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v36, v10, v36 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v18 +; SI-NEXT: v_mov_b32_e32 v18, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v12, v10, v12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v10, v28, v10 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v32, v28, v32 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v43, v24, v43 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v40, v30, v40 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v44, v24, v44 -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v44 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v25, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_mov_b32_e32 v11, v25 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v31, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v32, v20, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v20, v19 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v25 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v10 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v0, v11, v48, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v12, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v51, v15, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v9, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v31, v3, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v13, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v24, v5, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v21, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v19, v29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v23, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v30, v27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v28, v34, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v33, v36, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_mov_b32_e32 v15, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v35, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v37, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v63 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -171274,217 +167413,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_or_b32_e32 v31, v34, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64f16: @@ -175609,1586 +171553,1635 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:300 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v61, s29, 0 -; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: v_writelane_b32 v61, s26, 3 -; SI-NEXT: v_writelane_b32 v61, s25, 4 -; SI-NEXT: v_writelane_b32 v61, s24, 5 -; SI-NEXT: v_writelane_b32 v61, s23, 6 -; SI-NEXT: v_writelane_b32 v61, s22, 7 -; SI-NEXT: v_writelane_b32 v61, s21, 8 -; SI-NEXT: v_writelane_b32 v61, s20, 9 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_mov_b32 s61, s19 -; SI-NEXT: s_mov_b32 s62, s17 -; SI-NEXT: s_mov_b32 s73, s18 -; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: v_readfirstlane_b32 s35, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v27 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s41, v26 +; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s12, 0 -; SI-NEXT: v_readfirstlane_b32 s46, v29 -; SI-NEXT: v_writelane_b32 v62, s41, 1 -; SI-NEXT: v_readfirstlane_b32 s56, v28 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: v_writelane_b32 v62, s56, 3 -; SI-NEXT: v_readfirstlane_b32 s77, v30 -; SI-NEXT: v_readfirstlane_b32 s96, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_readfirstlane_b32 s38, v4 -; SI-NEXT: v_readfirstlane_b32 s94, v7 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_readfirstlane_b32 s91, v9 -; SI-NEXT: v_readfirstlane_b32 s98, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v11 -; SI-NEXT: v_readfirstlane_b32 s20, v10 -; SI-NEXT: v_readfirstlane_b32 s24, v13 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_readfirstlane_b32 s9, v14 -; SI-NEXT: v_readfirstlane_b32 s78, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v16 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s42, v18 -; SI-NEXT: v_readfirstlane_b32 s43, v21 -; SI-NEXT: v_readfirstlane_b32 s44, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s88, v23 -; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: v_readfirstlane_b32 s28, v25 -; SI-NEXT: v_readfirstlane_b32 s7, v24 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s87, v1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_writelane_b32 v43, s29, 0 +; SI-NEXT: v_writelane_b32 v43, s28, 1 +; SI-NEXT: v_writelane_b32 v43, s27, 2 +; SI-NEXT: v_writelane_b32 v43, s26, 3 +; SI-NEXT: v_writelane_b32 v43, s25, 4 +; SI-NEXT: v_writelane_b32 v43, s24, 5 +; SI-NEXT: v_writelane_b32 v43, s23, 6 +; SI-NEXT: v_writelane_b32 v43, s22, 7 +; SI-NEXT: v_writelane_b32 v43, s21, 8 +; SI-NEXT: v_writelane_b32 v43, s20, 9 +; SI-NEXT: v_writelane_b32 v43, s19, 10 +; SI-NEXT: v_writelane_b32 v43, s18, 11 +; SI-NEXT: v_writelane_b32 v43, s17, 12 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: v_writelane_b32 v41, s80, 24 +; SI-NEXT: v_writelane_b32 v41, s81, 25 +; SI-NEXT: v_writelane_b32 v41, s82, 26 +; SI-NEXT: v_writelane_b32 v41, s83, 27 +; SI-NEXT: v_writelane_b32 v41, s84, 28 +; SI-NEXT: v_writelane_b32 v41, s85, 29 +; SI-NEXT: v_writelane_b32 v41, s86, 30 +; SI-NEXT: v_writelane_b32 v41, s87, 31 +; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: s_mov_b32 s22, s16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s57, v10 +; SI-NEXT: v_writelane_b32 v42, s56, 0 +; SI-NEXT: v_readfirstlane_b32 s49, v23 +; SI-NEXT: v_writelane_b32 v42, s57, 1 +; SI-NEXT: v_readfirstlane_b32 s50, v22 +; SI-NEXT: v_writelane_b32 v42, s49, 2 +; SI-NEXT: v_readfirstlane_b32 s51, v20 +; SI-NEXT: v_writelane_b32 v42, s50, 3 +; SI-NEXT: v_readfirstlane_b32 s52, v21 +; SI-NEXT: v_writelane_b32 v42, s51, 4 +; SI-NEXT: v_writelane_b32 v42, s52, 5 +; SI-NEXT: v_readfirstlane_b32 s58, v19 +; SI-NEXT: v_readfirstlane_b32 s59, v18 +; SI-NEXT: v_readfirstlane_b32 s64, v30 +; SI-NEXT: v_readfirstlane_b32 s65, v28 +; SI-NEXT: v_readfirstlane_b32 s66, v29 +; SI-NEXT: v_readfirstlane_b32 s60, v27 +; SI-NEXT: v_readfirstlane_b32 s61, v26 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 27 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s44, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s7, v38 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s76, v16 +; SI-NEXT: v_readfirstlane_b32 s77, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v3 +; SI-NEXT: v_readfirstlane_b32 s47, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v24 +; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s99, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s88, v40 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 35 -; SI-NEXT: v_writelane_b32 v61, s62, 36 -; SI-NEXT: v_writelane_b32 v61, s10, 37 -; SI-NEXT: v_writelane_b32 v61, s61, 38 -; SI-NEXT: v_writelane_b32 v61, s73, 39 -; SI-NEXT: v_writelane_b32 v61, s35, 40 -; SI-NEXT: v_writelane_b32 v61, s96, 41 -; SI-NEXT: v_writelane_b32 v61, s6, 42 -; SI-NEXT: v_writelane_b32 v61, s38, 43 -; SI-NEXT: v_writelane_b32 v61, s94, 44 -; SI-NEXT: v_writelane_b32 v61, s90, 45 -; SI-NEXT: v_writelane_b32 v61, s91, 46 -; SI-NEXT: v_writelane_b32 v61, s98, 47 -; SI-NEXT: v_writelane_b32 v61, s93, 48 -; SI-NEXT: v_writelane_b32 v61, s20, 49 -; SI-NEXT: v_writelane_b32 v61, s24, 50 -; SI-NEXT: v_writelane_b32 v61, s27, 51 -; SI-NEXT: v_writelane_b32 v61, s8, 52 -; SI-NEXT: v_writelane_b32 v61, s9, 53 -; SI-NEXT: v_writelane_b32 v61, s78, 54 -; SI-NEXT: v_writelane_b32 v61, s14, 55 -; SI-NEXT: v_writelane_b32 v61, s40, 56 -; SI-NEXT: v_writelane_b32 v61, s42, 57 -; SI-NEXT: v_writelane_b32 v61, s43, 58 -; SI-NEXT: v_writelane_b32 v61, s44, 59 -; SI-NEXT: v_writelane_b32 v61, s88, 60 -; SI-NEXT: v_writelane_b32 v61, s37, 61 -; SI-NEXT: v_writelane_b32 v61, s28, 62 -; SI-NEXT: v_writelane_b32 v61, s7, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s99, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s95, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s36, v31 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s8, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s9, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s91, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s93, v39 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s80, v48 +; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s13, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 +; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s79, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s83, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s36, v34 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s10, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s11, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s57, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s59, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s39, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s54, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s18, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s90, v48 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v49 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s95, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s81, v51 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v43, s44, 37 +; SI-NEXT: v_writelane_b32 v43, s6, 38 +; SI-NEXT: v_writelane_b32 v43, s83, 39 +; SI-NEXT: v_writelane_b32 v43, s7, 40 +; SI-NEXT: v_writelane_b32 v43, s8, 41 +; SI-NEXT: v_writelane_b32 v43, s36, 42 +; SI-NEXT: v_writelane_b32 v43, s9, 43 +; SI-NEXT: v_writelane_b32 v43, s10, 44 +; SI-NEXT: v_writelane_b32 v43, s11, 45 +; SI-NEXT: v_writelane_b32 v43, s12, 46 +; SI-NEXT: v_writelane_b32 v43, s13, 47 +; SI-NEXT: v_writelane_b32 v43, s14, 48 +; SI-NEXT: v_writelane_b32 v43, s15, 49 +; SI-NEXT: v_writelane_b32 v43, s40, 50 +; SI-NEXT: v_writelane_b32 v43, s41, 51 +; SI-NEXT: v_writelane_b32 v43, s42, 52 +; SI-NEXT: v_writelane_b32 v43, s43, 53 +; SI-NEXT: v_writelane_b32 v43, s76, 54 +; SI-NEXT: v_writelane_b32 v43, s77, 55 +; SI-NEXT: v_writelane_b32 v43, s46, 56 +; SI-NEXT: v_writelane_b32 v43, s47, 57 +; SI-NEXT: v_writelane_b32 v43, s78, 58 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s29, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s92, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s17, v37 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s67, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s94, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s21, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s24, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v51 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v14 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v12 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 59 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 60 +; SI-NEXT: v_writelane_b32 v43, s38, 61 +; SI-NEXT: v_writelane_b32 v43, s39, 62 +; SI-NEXT: v_writelane_b32 v43, s48, 63 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s86, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s63, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s74, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: v_readfirstlane_b32 s25, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s23, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s35, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s31, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s72, v37 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s20, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s18, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s19, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s75, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s67, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s71, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s22, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s60, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s79, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s30, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s74, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s69, v34 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_readfirstlane_b32 s53, v50 +; SI-NEXT: v_writelane_b32 v42, s53, 6 +; SI-NEXT: v_writelane_b32 v42, s54, 7 +; SI-NEXT: v_writelane_b32 v42, s58, 8 +; SI-NEXT: v_readfirstlane_b32 s55, v51 +; SI-NEXT: v_writelane_b32 v42, s59, 9 +; SI-NEXT: v_writelane_b32 v42, s55, 10 +; SI-NEXT: v_writelane_b32 v42, s64, 11 +; SI-NEXT: v_writelane_b32 v42, s65, 12 +; SI-NEXT: v_writelane_b32 v42, s66, 13 +; SI-NEXT: v_writelane_b32 v42, s67, 14 +; SI-NEXT: v_writelane_b32 v42, s69, 15 +; SI-NEXT: v_writelane_b32 v42, s70, 16 +; SI-NEXT: v_writelane_b32 v42, s71, 17 +; SI-NEXT: v_writelane_b32 v42, s60, 18 +; SI-NEXT: v_writelane_b32 v42, s61, 19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s85, v36 +; SI-NEXT: v_writelane_b32 v42, s68, 20 +; SI-NEXT: v_writelane_b32 v42, s85, 21 +; SI-NEXT: v_writelane_b32 v42, s30, 22 +; SI-NEXT: v_writelane_b32 v42, s34, 23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s37, v38 +; SI-NEXT: v_writelane_b32 v42, s86, 24 +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_writelane_b32 v42, s37, 25 +; SI-NEXT: v_writelane_b32 v42, s87, 26 +; SI-NEXT: v_writelane_b32 v42, s20, 27 +; SI-NEXT: v_writelane_b32 v42, s84, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 -; SI-NEXT: v_writelane_b32 v62, s58, 4 -; SI-NEXT: v_writelane_b32 v62, s16, 5 -; SI-NEXT: v_writelane_b32 v62, s77, 6 -; SI-NEXT: v_writelane_b32 v62, s79, 7 -; SI-NEXT: v_writelane_b32 v62, s29, 8 -; SI-NEXT: v_writelane_b32 v62, s75, 9 -; SI-NEXT: v_writelane_b32 v62, s21, 10 -; SI-NEXT: v_writelane_b32 v62, s23, 11 -; SI-NEXT: v_writelane_b32 v62, s17, 12 -; SI-NEXT: v_writelane_b32 v62, s18, 13 -; SI-NEXT: v_writelane_b32 v62, s52, 14 -; SI-NEXT: v_writelane_b32 v62, s65, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s49, 17 -; SI-NEXT: v_writelane_b32 v62, s67, 18 -; SI-NEXT: v_writelane_b32 v62, s71, 19 -; SI-NEXT: v_writelane_b32 v62, s70, 20 -; SI-NEXT: v_writelane_b32 v62, s84, 21 -; SI-NEXT: v_writelane_b32 v62, s80, 22 -; SI-NEXT: v_writelane_b32 v62, s83, 23 -; SI-NEXT: v_writelane_b32 v62, s51, 24 -; SI-NEXT: v_writelane_b32 v62, s82, 25 -; SI-NEXT: v_writelane_b32 v62, s55, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s63, 28 -; SI-NEXT: v_writelane_b32 v62, s74, 29 -; SI-NEXT: v_writelane_b32 v62, s72, 30 -; SI-NEXT: v_writelane_b32 v62, s22, 31 -; SI-NEXT: v_writelane_b32 v62, s60, 32 +; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_writelane_b32 v42, s92, 29 +; SI-NEXT: v_writelane_b32 v42, s62, 30 +; SI-NEXT: v_readfirstlane_b32 s63, v48 +; SI-NEXT: v_writelane_b32 v42, s23, 31 +; SI-NEXT: v_writelane_b32 v42, s63, 32 +; SI-NEXT: v_writelane_b32 v42, s96, 33 +; SI-NEXT: v_writelane_b32 v42, s17, 34 +; SI-NEXT: v_writelane_b32 v42, s18, 35 +; SI-NEXT: v_writelane_b32 v42, s94, 36 +; SI-NEXT: v_writelane_b32 v42, s19, 37 +; SI-NEXT: v_writelane_b32 v42, s31, 38 +; SI-NEXT: v_writelane_b32 v42, s35, 39 +; SI-NEXT: v_writelane_b32 v42, s24, 40 +; SI-NEXT: v_writelane_b32 v42, s21, 41 +; SI-NEXT: v_writelane_b32 v42, s72, 42 +; SI-NEXT: v_writelane_b32 v42, s73, 43 +; SI-NEXT: v_writelane_b32 v42, s74, 44 +; SI-NEXT: v_writelane_b32 v42, s75, 45 +; SI-NEXT: v_writelane_b32 v42, s25, 46 +; SI-NEXT: v_writelane_b32 v42, s16, 47 +; SI-NEXT: v_writelane_b32 v42, s97, 48 +; SI-NEXT: v_writelane_b32 v42, s28, 49 +; SI-NEXT: v_writelane_b32 v42, s29, 50 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s73, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 9 -; SI-NEXT: v_readlane_b32 s5, v61, 8 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 7 -; SI-NEXT: v_readlane_b32 s5, v61, 6 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 5 -; SI-NEXT: v_readlane_b32 s5, v61, 4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 3 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 12 +; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 1 -; SI-NEXT: v_readlane_b32 s5, v61, 0 +; SI-NEXT: v_writelane_b32 v42, s4, 51 +; SI-NEXT: v_readlane_b32 s4, v43, 5 +; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s87, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s31, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s90, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s24, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s88, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s59, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s72, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s83, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_or_b32 s57, s4, s5 ; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s71, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s49, 0xff -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s18, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s54, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s11, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s26, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s69, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s62, s4, s5 ; SI-NEXT: s_and_b32 s4, s81, 0xff -; SI-NEXT: s_lshl_b32 s5, s45, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s89, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s99, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 35 -; SI-NEXT: v_readlane_b32 s5, v61, 34 -; SI-NEXT: s_mov_b32 s6, s99 -; SI-NEXT: s_mov_b32 s99, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s96, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 33 -; SI-NEXT: v_readlane_b32 s5, v61, 32 -; SI-NEXT: s_mov_b32 s55, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s86, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 31 -; SI-NEXT: v_readlane_b32 s5, v61, 30 -; SI-NEXT: s_mov_b32 s35, s87 -; SI-NEXT: s_mov_b32 s82, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s87, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 29 -; SI-NEXT: v_readlane_b32 s5, v61, 28 -; SI-NEXT: s_mov_b32 s83, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s51, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 27 -; SI-NEXT: v_readlane_b32 s5, v61, 26 -; SI-NEXT: s_mov_b32 s84, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s80, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 25 -; SI-NEXT: v_readlane_b32 s5, v61, 24 -; SI-NEXT: s_mov_b32 s71, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s70, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 23 -; SI-NEXT: v_readlane_b32 s5, v61, 22 -; SI-NEXT: s_mov_b32 s49, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s67, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 21 -; SI-NEXT: v_readlane_b32 s5, v61, 20 -; SI-NEXT: s_mov_b32 s65, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s64, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 19 -; SI-NEXT: v_readlane_b32 s5, v61, 18 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_mov_b32 s18, s54 -; SI-NEXT: s_mov_b32 s15, s50 -; SI-NEXT: s_mov_b32 s54, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s50, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 17 -; SI-NEXT: v_readlane_b32 s5, v61, 16 -; SI-NEXT: s_mov_b32 s23, s34 -; SI-NEXT: s_mov_b32 s14, s48 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s48, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 15 -; SI-NEXT: v_readlane_b32 s5, v61, 14 -; SI-NEXT: s_mov_b32 s52, s4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s75, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 13 -; SI-NEXT: v_readlane_b32 s5, v61, 12 -; SI-NEXT: s_mov_b32 s29, s30 -; SI-NEXT: s_mov_b32 s79, s92 -; SI-NEXT: s_mov_b32 s30, s4 +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 9 +; SI-NEXT: v_readlane_b32 s5, v43, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s92, s5 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 11 -; SI-NEXT: v_readlane_b32 s5, v61, 10 -; SI-NEXT: s_mov_b32 s21, s39 -; SI-NEXT: s_mov_b32 s39, s4 +; SI-NEXT: s_or_b32 s5, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 7 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_mov_b32 s77, s5 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s38, s31 -; SI-NEXT: s_mov_b32 s16, s59 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s58, s57 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s41, s13 -; SI-NEXT: s_mov_b32 s28, s26 -; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s26, s25 -; SI-NEXT: s_mov_b32 s85, s97 -; SI-NEXT: s_mov_b32 s25, s69 -; SI-NEXT: s_mov_b32 s97, s19 -; SI-NEXT: s_mov_b32 s37, s66 -; SI-NEXT: s_mov_b32 s69, s81 -; SI-NEXT: s_mov_b32 s44, s45 -; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s36 -; SI-NEXT: s_mov_b32 s98, s76 -; SI-NEXT: s_mov_b32 s36, s89 -; SI-NEXT: s_mov_b32 s90, s68 -; SI-NEXT: s_mov_b32 s89, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_cbranch_execnz .LBB93_3 -; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s39, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 6 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s7, s6, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 11 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s30, 3 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 vcc_lo, s92, 8 -; SI-NEXT: s_or_b32 s5, vcc_lo, s5 -; SI-NEXT: s_add_i32 vcc_lo, s52, 3 -; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 -; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s34, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s60, s48, 8 -; SI-NEXT: s_or_b32 s60, s60, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s54, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s61, s50, 8 -; SI-NEXT: s_or_b32 s61, s61, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s65, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s62, s64, 8 -; SI-NEXT: s_or_b32 s62, s62, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s49, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s63, s67, 8 -; SI-NEXT: s_or_b32 s10, s63, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s71, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s72, s70, 8 -; SI-NEXT: s_or_b32 s72, s72, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s84, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s73, s80, 8 -; SI-NEXT: s_or_b32 s73, s73, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s83, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s51, 8 -; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s87, 8 -; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s55, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s86, 8 -; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s99, 3 -; SI-NEXT: s_add_i32 s95, s36, 3 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: s_add_i32 s89, s89, 3 -; SI-NEXT: s_and_b32 s95, s95, 0xff -; SI-NEXT: s_lshl_b32 s88, s90, 8 -; SI-NEXT: s_add_i32 s36, s53, 3 -; SI-NEXT: s_or_b32 s77, s77, vcc_hi -; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s6, 8 -; SI-NEXT: s_or_b32 s22, s88, s95 -; SI-NEXT: s_and_b32 s95, s36, 0xff -; SI-NEXT: s_lshl_b32 s92, s98, 8 -; SI-NEXT: s_add_i32 s53, s66, 3 -; SI-NEXT: s_or_b32 s89, vcc_hi, s89 -; SI-NEXT: s_or_b32 s92, s92, s95 -; SI-NEXT: s_and_b32 s95, s53, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s14, 8 -; SI-NEXT: s_add_i32 s66, s69, 3 -; SI-NEXT: s_or_b32 s95, vcc_hi, s95 -; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s36, s44, 8 -; SI-NEXT: s_add_i32 s68, s97, 3 -; SI-NEXT: s_or_b32 vcc_hi, s36, vcc_hi -; SI-NEXT: s_and_b32 s36, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s37, 8 -; SI-NEXT: s_add_i32 s69, s85, 3 -; SI-NEXT: s_or_b32 s36, s39, s36 -; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s25, 8 -; SI-NEXT: s_add_i32 s81, s7, 3 -; SI-NEXT: s_or_b32 s39, s52, s39 -; SI-NEXT: s_and_b32 s52, s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s26, 8 -; SI-NEXT: s_add_i32 s85, s41, 3 -; SI-NEXT: s_or_b32 s52, s53, s52 -; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s28, 8 -; SI-NEXT: s_add_i32 s97, s58, 3 -; SI-NEXT: s_or_b32 s53, s64, s53 -; SI-NEXT: s_and_b32 s64, s97, 0xff -; SI-NEXT: s_lshl_b32 s66, s12, 8 -; SI-NEXT: s_add_i32 s19, s79, 3 -; SI-NEXT: s_or_b32 s64, s66, s64 -; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s66, s56, 8 -; SI-NEXT: s_add_i32 s25, s23, 3 -; SI-NEXT: s_or_b32 s66, s66, s19 -; SI-NEXT: s_and_b32 s19, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s16, 8 -; SI-NEXT: s_add_i32 s26, s15, 3 -; SI-NEXT: s_or_b32 s67, s6, s19 -; SI-NEXT: s_and_b32 s6, s26, 0xff -; SI-NEXT: s_lshl_b32 s19, s29, 8 -; SI-NEXT: s_add_i32 s28, s18, 3 -; SI-NEXT: s_or_b32 s68, s19, s6 -; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s19, s21, 8 -; SI-NEXT: s_or_b32 s69, s19, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 12 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 16 -; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 7 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 13 -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 8 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: v_readlane_b32 s6, v43, 10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readlane_b32 s6, v43, 1 +; SI-NEXT: v_readlane_b32 s7, v43, 0 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 5 -; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 6 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 14 -; SI-NEXT: s_or_b32 s17, s17, s20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 4 +; SI-NEXT: s_or_b32 s7, s6, s7 ; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 18 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 3 -; SI-NEXT: s_add_i32 s13, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 15 -; SI-NEXT: s_or_b32 s18, s18, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 2 -; SI-NEXT: s_and_b32 s6, s13, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s20, s98, 0xff -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 -; SI-NEXT: s_and_b32 s27, s27, 0xff -; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 1 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 -; SI-NEXT: s_or_b32 s23, s23, s27 -; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 0 -; SI-NEXT: s_and_b32 s6, s41, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s24, s86, 0xff -; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 27 -; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v61, 63 -; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 20 -; SI-NEXT: s_add_i32 s11, s72, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s72, v61, 62 -; SI-NEXT: s_and_b32 s6, s46, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s27, s82, 0xff -; SI-NEXT: s_lshl_b32 s72, s72, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 -; SI-NEXT: s_or_b32 s27, s72, s27 -; SI-NEXT: v_readlane_b32 s72, v61, 61 -; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s65, s72, 3 -; SI-NEXT: v_readlane_b32 s73, v61, 60 -; SI-NEXT: s_and_b32 s6, s47, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s72, s65, 0xff -; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 29 -; SI-NEXT: s_or_b32 s72, s73, s72 -; SI-NEXT: v_readlane_b32 s73, v61, 59 -; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 23 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s54, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v61, 58 -; SI-NEXT: s_and_b32 s6, s56, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s73, s54, 0xff -; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 57 -; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s74, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 56 -; SI-NEXT: s_and_b32 s6, s58, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s74, s50, 0xff -; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 32 -; SI-NEXT: s_or_b32 s74, s76, s74 -; SI-NEXT: v_readlane_b32 s76, v61, 55 -; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 26 -; SI-NEXT: s_add_i32 s19, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 54 -; SI-NEXT: s_and_b32 s6, s59, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s76, s48, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 30 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 53 -; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s75, s78, 0x300 -; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s78, v61, 52 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s77, s37, 0xff -; SI-NEXT: s_lshl_b32 s78, s78, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: s_or_b32 s77, s78, s77 -; SI-NEXT: v_readlane_b32 s78, v61, 51 -; SI-NEXT: s_add_i32 s21, s89, 0x300 -; SI-NEXT: s_add_i32 s89, s79, 0x300 -; SI-NEXT: s_add_i32 s34, s78, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 50 -; SI-NEXT: s_and_b32 s78, s34, 0xff -; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s78, s79, s78 -; SI-NEXT: v_readlane_b32 s79, v61, 49 -; SI-NEXT: v_readlane_b32 s90, v61, 48 -; SI-NEXT: s_add_i32 s25, s92, 0x300 -; SI-NEXT: s_add_i32 s30, s79, 3 -; SI-NEXT: s_lshl_b32 s92, s90, 8 -; SI-NEXT: v_readlane_b32 s90, v61, 47 -; SI-NEXT: s_and_b32 s79, s30, 0xff -; SI-NEXT: s_add_i32 s93, s90, 3 -; SI-NEXT: v_readlane_b32 s90, v61, 46 -; SI-NEXT: s_or_b32 s79, s92, s79 -; SI-NEXT: s_and_b32 s92, s93, 0xff -; SI-NEXT: s_lshl_b32 s91, s90, 8 -; SI-NEXT: v_readlane_b32 s90, v61, 45 -; SI-NEXT: s_or_b32 s91, s91, s92 -; SI-NEXT: s_add_i32 s90, s90, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 44 -; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s92, 8 -; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 43 -; SI-NEXT: s_add_i32 s92, s92, 3 -; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s38, 8 -; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 42 -; SI-NEXT: s_add_i32 s93, s93, 3 -; SI-NEXT: v_readlane_b32 s94, v61, 41 -; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s94, 8 -; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 40 -; SI-NEXT: s_add_i32 s94, s94, 3 -; SI-NEXT: s_add_i32 s26, s95, 0x300 -; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s35, 8 -; SI-NEXT: s_or_b32 s94, s95, s94 -; SI-NEXT: v_readlane_b32 s95, v61, 1 -; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 0 -; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 -; SI-NEXT: s_and_b32 s95, s95, 0xff -; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 3 -; SI-NEXT: s_or_b32 s95, vcc_lo, s95 -; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 -; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 -; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 5 -; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 4 -; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s30, s30, 8 -; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 7 -; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 6 -; SI-NEXT: s_and_b32 s30, s30, 0xff -; SI-NEXT: s_lshl_b32 s31, s31, 8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_hi -; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 9 -; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 -; SI-NEXT: s_and_b32 s31, s31, 0xff -; SI-NEXT: s_lshl_b32 s34, s34, 8 -; SI-NEXT: s_or_b32 s31, s34, s31 -; SI-NEXT: v_readlane_b32 s34, v61, 39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, vcc_lo -; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 38 -; SI-NEXT: s_and_b32 s34, s34, 0xff -; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 s95, 0x300 -; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 37 -; SI-NEXT: s_add_i32 s29, s36, 0x300 -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 36 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s95 -; SI-NEXT: s_and_b32 s35, s35, 0xff -; SI-NEXT: s_lshl_b32 s36, s36, 8 -; SI-NEXT: s_or_b32 s35, s36, s35 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_add_i32 s7, s60, 0x300 -; SI-NEXT: s_add_i32 s8, s61, 0x300 -; SI-NEXT: s_add_i32 s9, s62, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_add_i32 s40, s39, 0x300 -; SI-NEXT: s_add_i32 s41, s52, 0x300 -; SI-NEXT: s_add_i32 s42, s53, 0x300 -; SI-NEXT: s_add_i32 s43, s64, 0x300 -; SI-NEXT: s_add_i32 s44, s66, 0x300 -; SI-NEXT: s_add_i32 s45, s67, 0x300 -; SI-NEXT: s_add_i32 s46, s68, 0x300 -; SI-NEXT: s_add_i32 s47, s69, 0x300 -; SI-NEXT: s_add_i32 s56, s70, 0x300 -; SI-NEXT: s_add_i32 s57, s71, 0x300 -; SI-NEXT: s_add_i32 s58, s81, 0x300 -; SI-NEXT: s_add_i32 s59, s83, 0x300 -; SI-NEXT: s_add_i32 s60, s85, 0x300 -; SI-NEXT: s_add_i32 s61, s96, 0x300 -; SI-NEXT: s_add_i32 s62, s97, 0x300 -; SI-NEXT: s_addk_i32 s63, 0x300 -; SI-NEXT: s_addk_i32 s88, 0x300 -; SI-NEXT: s_addk_i32 s23, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_addk_i32 s27, 0x300 -; SI-NEXT: s_addk_i32 s72, 0x300 -; SI-NEXT: s_addk_i32 s73, 0x300 -; SI-NEXT: s_addk_i32 s74, 0x300 -; SI-NEXT: s_addk_i32 s76, 0x300 -; SI-NEXT: s_addk_i32 s77, 0x300 -; SI-NEXT: s_addk_i32 s78, 0x300 -; SI-NEXT: s_addk_i32 s79, 0x300 -; SI-NEXT: s_addk_i32 s91, 0x300 -; SI-NEXT: s_addk_i32 s90, 0x300 -; SI-NEXT: s_addk_i32 s92, 0x300 -; SI-NEXT: s_addk_i32 s93, 0x300 -; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s30, 0x300 -; SI-NEXT: s_addk_i32 s31, 0x300 -; SI-NEXT: s_addk_i32 s34, 0x300 -; SI-NEXT: s_addk_i32 s35, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v27 -; SI-NEXT: v_or_b32_e32 v15, v19, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v37 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v38 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v49 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v43 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v41 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v45 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v46 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v59 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v58 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v34, v29 -; SI-NEXT: v_or_b32_e32 v30, v33, v30 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: s_mov_b32 s18, s54 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_mov_b32 s15, s50 -; SI-NEXT: s_mov_b32 s23, s34 -; SI-NEXT: s_mov_b32 s21, s39 -; SI-NEXT: s_mov_b32 s29, s30 -; SI-NEXT: s_mov_b32 s79, s92 -; SI-NEXT: s_mov_b32 s16, s59 -; SI-NEXT: s_mov_b32 s58, s57 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s41, s13 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s7, s85 -; SI-NEXT: s_mov_b32 s28, s26 -; SI-NEXT: s_mov_b32 s26, s25 -; SI-NEXT: s_mov_b32 s85, s97 -; SI-NEXT: s_mov_b32 s97, s19 -; SI-NEXT: s_mov_b32 s25, s69 -; SI-NEXT: s_mov_b32 s69, s81 -; SI-NEXT: s_mov_b32 s37, s66 -; SI-NEXT: s_mov_b32 s66, s53 -; SI-NEXT: s_mov_b32 s53, s36 -; SI-NEXT: s_mov_b32 s36, s89 -; SI-NEXT: s_mov_b32 s89, s95 -; SI-NEXT: s_mov_b32 s44, s45 -; SI-NEXT: s_mov_b32 s14, s48 -; SI-NEXT: s_mov_b32 s98, s76 -; SI-NEXT: s_mov_b32 s90, s68 -; SI-NEXT: s_mov_b32 s38, s31 -; SI-NEXT: s_mov_b32 s6, s99 -; SI-NEXT: s_mov_b32 s35, s87 -; SI-NEXT: v_readlane_b32 s99, v61, 35 -; SI-NEXT: v_readlane_b32 s96, v61, 34 -; SI-NEXT: v_readlane_b32 s55, v61, 33 -; SI-NEXT: v_readlane_b32 s82, v61, 31 -; SI-NEXT: v_readlane_b32 s86, v61, 32 -; SI-NEXT: v_readlane_b32 s83, v61, 29 -; SI-NEXT: v_readlane_b32 s87, v61, 30 -; SI-NEXT: v_readlane_b32 s84, v61, 27 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s80, v61, 26 -; SI-NEXT: v_readlane_b32 s71, v61, 25 -; SI-NEXT: v_readlane_b32 s49, v61, 23 -; SI-NEXT: v_readlane_b32 s70, v61, 24 -; SI-NEXT: v_readlane_b32 s65, v61, 21 -; SI-NEXT: v_readlane_b32 s67, v61, 22 -; SI-NEXT: v_readlane_b32 s54, v61, 19 -; SI-NEXT: v_readlane_b32 s64, v61, 20 -; SI-NEXT: v_readlane_b32 s50, v61, 18 -; SI-NEXT: v_readlane_b32 s34, v61, 17 -; SI-NEXT: v_readlane_b32 s52, v61, 15 -; SI-NEXT: v_readlane_b32 s48, v61, 16 -; SI-NEXT: v_readlane_b32 s30, v61, 13 -; SI-NEXT: v_readlane_b32 s39, v61, 11 -; SI-NEXT: v_readlane_b32 s92, v61, 12 -; SI-NEXT: v_readlane_b32 s77, v61, 10 -; SI-NEXT: v_readlane_b32 s75, v61, 14 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s13, 24 +; SI-NEXT: s_or_b32 s9, s8, s6 +; SI-NEXT: v_readlane_b32 s6, v43, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v43, 2 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_writelane_b32 v42, s9, 53 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 +; SI-NEXT: s_or_b32 s9, s8, s9 +; SI-NEXT: s_and_b32 s8, s14, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s15, 24 +; SI-NEXT: s_or_b32 s11, s10, s8 +; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_writelane_b32 v42, s11, 54 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s11, s78, 8 +; SI-NEXT: s_or_b32 s11, s10, s11 +; SI-NEXT: s_and_b32 s10, s76, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s77, 24 +; SI-NEXT: s_or_b32 s13, s12, s10 +; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s38, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v42, s13, 55 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s49, 8 +; SI-NEXT: s_or_b32 s13, s12, s13 +; SI-NEXT: s_and_b32 s12, s39, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s48, 24 +; SI-NEXT: s_or_b32 s27, s14, s12 +; SI-NEXT: s_and_b32 s12, s51, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s52, 24 +; SI-NEXT: s_or_b32 s12, s14, s12 +; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: s_lshl_b32 s15, s55, 8 +; SI-NEXT: s_or_b32 s15, s14, s15 +; SI-NEXT: s_and_b32 s14, s53, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_or_b32 s26, s16, s14 +; SI-NEXT: s_and_b32 s14, s65, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s25, s66, 24 +; SI-NEXT: s_or_b32 s14, s25, s14 +; SI-NEXT: s_and_b32 s25, s85, 0xff +; SI-NEXT: s_lshl_b32 s40, s68, 8 +; SI-NEXT: s_or_b32 s41, s25, s40 +; SI-NEXT: s_and_b32 s25, s69, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s70, 24 +; SI-NEXT: s_or_b32 s16, s40, s25 +; SI-NEXT: s_and_b32 s40, s37, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_lshl_b32 s42, s87, 24 +; SI-NEXT: s_or_b32 s40, s42, s40 +; SI-NEXT: s_and_b32 s42, s20, 0xff +; SI-NEXT: s_lshl_b32 s43, s30, 8 +; SI-NEXT: s_or_b32 s43, s42, s43 +; SI-NEXT: s_and_b32 s42, s71, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s67, 24 +; SI-NEXT: s_or_b32 s69, s76, s42 +; SI-NEXT: s_and_b32 s42, s19, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s18, 24 +; SI-NEXT: s_or_b32 s42, s76, s42 +; SI-NEXT: s_and_b32 s76, s96, 0xff +; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s86, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s34, 24 +; SI-NEXT: s_or_b32 s70, s78, s77 +; SI-NEXT: s_and_b32 s77, s31, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s35, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s78, s77 +; SI-NEXT: s_or_b32 vcc_hi, s76, s70 +; SI-NEXT: s_and_b32 s76, s94, 0xff +; SI-NEXT: s_lshl_b32 s77, s17, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s84, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s92, 24 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_or_b32 s71, s78, s77 +; SI-NEXT: s_and_b32 s77, s24, 0xff +; SI-NEXT: s_or_b32 s41, s41, s16 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s21, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 36 +; SI-NEXT: s_or_b32 s38, s78, s77 +; SI-NEXT: s_or_b32 s39, s76, s71 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: s_lshl_b32 s77, s99, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 34 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s80, 56 +; SI-NEXT: s_or_b32 s80, s78, s77 +; SI-NEXT: s_and_b32 s77, s97, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 33 +; SI-NEXT: s_or_b32 s48, s78, s77 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s98, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 32 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 31 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s81, 57 +; SI-NEXT: s_or_b32 s81, s78, s77 +; SI-NEXT: s_and_b32 s77, s89, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s90, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 30 +; SI-NEXT: s_or_b32 s50, s78, s77 +; SI-NEXT: s_or_b32 s51, s76, s81 +; SI-NEXT: s_and_b32 s76, s79, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 29 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s82, 24 +; SI-NEXT: v_writelane_b32 v42, s82, 58 +; SI-NEXT: s_or_b32 s82, s78, s77 +; SI-NEXT: s_and_b32 s77, s36, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s83, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 26 +; SI-NEXT: s_or_b32 s52, s78, s77 +; SI-NEXT: s_or_b32 s53, s76, s82 +; SI-NEXT: s_and_b32 s76, s91, 0xff +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 25 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 28 +; SI-NEXT: s_or_b32 s83, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 27 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 21 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s55, s76, s83 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 20 +; SI-NEXT: s_or_b32 s54, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 19 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: s_or_b32 s84, s78, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 22 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 16 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_or_b32 s65, s76, s84 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 15 +; SI-NEXT: s_or_b32 s64, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 14 +; SI-NEXT: v_writelane_b32 v42, s93, 59 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 13 +; SI-NEXT: v_writelane_b32 v42, s90, 60 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: s_or_b32 s96, s78, s77 +; SI-NEXT: s_and_b32 s77, s44, 0xff +; SI-NEXT: v_readlane_b32 s25, v43, 17 +; SI-NEXT: v_readlane_b32 s16, v42, 51 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 24 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshr_b64 s[16:17], vcc, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s66, s78, s77 +; SI-NEXT: s_mov_b32 s77, s22 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 s22, s73, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 52 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_or_b32 s15, s15, s26 +; SI-NEXT: s_mov_b32 s93, s88 +; SI-NEXT: s_mov_b32 s88, s98 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s98, s62, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s8 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_or_b32 s8, s22, s54 +; SI-NEXT: s_mov_b32 s22, s77 +; SI-NEXT: s_lshr_b32 s77, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 53 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_mov_b32 s89, s79 +; SI-NEXT: s_mov_b32 s79, s91 +; SI-NEXT: s_mov_b32 s91, s99 +; SI-NEXT: s_or_b32 s67, s76, s96 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_and_b32 s56, s56, 0xffff +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_and_b32 s30, s58, 0xffff +; SI-NEXT: s_and_b32 s86, s61, 0xffff +; SI-NEXT: s_and_b32 s85, s63, 0xffff +; SI-NEXT: s_and_b32 s87, s72, 0xffff +; SI-NEXT: s_and_b32 s68, s74, 0xffff +; SI-NEXT: s_and_b32 s99, s75, 0xffff +; SI-NEXT: s_or_b32 s74, s44, s4 +; SI-NEXT: s_mov_b32 s75, s5 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 +; SI-NEXT: s_or_b32 s72, s45, s6 +; SI-NEXT: s_mov_b32 s73, s7 +; SI-NEXT: s_lshr_b64 s[20:21], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 54 +; SI-NEXT: s_or_b32 s43, s43, s69 +; SI-NEXT: s_and_b32 s34, s59, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff +; SI-NEXT: s_mov_b32 s63, s9 +; SI-NEXT: s_or_b32 s60, s47, s10 +; SI-NEXT: s_mov_b32 s61, s11 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_or_b32 s58, s56, s12 +; SI-NEXT: s_mov_b32 s59, s13 +; SI-NEXT: s_or_b32 s56, s57, s14 +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_or_b32 s46, s30, s40 +; SI-NEXT: s_mov_b32 s94, s6 +; SI-NEXT: s_mov_b32 s92, s4 +; SI-NEXT: s_mov_b32 s47, s41 +; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16 +; SI-NEXT: s_or_b32 s40, s86, s38 +; SI-NEXT: s_mov_b32 s41, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 +; SI-NEXT: s_or_b32 s14, s98, s48 +; SI-NEXT: s_mov_b32 s15, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 +; SI-NEXT: s_or_b32 s12, s85, s50 +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_or_b32 s10, s87, s52 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_or_b32 s6, s68, s64 +; SI-NEXT: s_mov_b32 s7, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 +; SI-NEXT: s_or_b32 s4, s99, s66 +; SI-NEXT: s_mov_b32 s5, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 55 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_lshr_b32 s67, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s27, 16 +; SI-NEXT: s_lshr_b32 s55, s26, 16 +; SI-NEXT: s_lshr_b32 s36, s37, 16 +; SI-NEXT: s_lshr_b32 s69, s69, 16 +; SI-NEXT: s_lshr_b32 s65, s70, 16 +; SI-NEXT: s_lshr_b32 s71, s71, 16 +; SI-NEXT: s_lshr_b32 s37, s80, 16 +; SI-NEXT: v_readlane_b32 s80, v42, 56 +; SI-NEXT: s_lshr_b32 s39, s81, 16 +; SI-NEXT: v_readlane_b32 s81, v42, 57 +; SI-NEXT: s_lshr_b32 s49, s82, 16 +; SI-NEXT: v_readlane_b32 s82, v42, 58 +; SI-NEXT: s_lshr_b32 s51, s83, 16 +; SI-NEXT: s_mov_b32 s99, s91 +; SI-NEXT: s_mov_b32 s91, s79 +; SI-NEXT: s_mov_b32 s98, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: v_readlane_b32 s90, v42, 60 +; SI-NEXT: s_mov_b32 s88, s93 +; SI-NEXT: v_readlane_b32 s93, v42, 59 +; SI-NEXT: s_lshr_b32 s53, s84, 16 +; SI-NEXT: s_mov_b32 s68, s16 +; SI-NEXT: s_lshr_b32 s70, s96, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: v_readlane_b32 s4, v43, 40 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 38 +; SI-NEXT: v_readlane_b32 s6, v43, 37 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v43, 14 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 13 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 43 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s8, v43, 23 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_readlane_b32 s8, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v43, 19 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v43, 18 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s80, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 28 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s9, s93, 8 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v43, 27 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s91, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 26 +; SI-NEXT: v_readlane_b32 s11, v43, 25 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 45 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_readlane_b32 s11, v43, 44 +; SI-NEXT: v_readlane_b32 s12, v43, 42 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 39 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s79, 3 +; SI-NEXT: v_readlane_b32 s12, v43, 30 +; SI-NEXT: v_readlane_b32 s13, v43, 29 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s12, s82, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s12, s81, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s13, s95, 8 +; SI-NEXT: s_add_i32 s14, s89, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s90, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s98, 3 +; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s15, v43, 32 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v43, 31 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v42, 50 +; SI-NEXT: s_add_i32 s19, s14, 3 +; SI-NEXT: v_readlane_b32 s15, v42, 49 +; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: s_and_b32 s14, s19, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s16, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v43, 36 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s16, s99, 8 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v43, 34 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v42, 47 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 46 +; SI-NEXT: v_readlane_b32 s18, v42, 40 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s97, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 41 +; SI-NEXT: s_and_b32 s18, s97, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: s_add_i32 s85, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 28 +; SI-NEXT: s_and_b32 s17, s85, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 29 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s40, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 45 +; SI-NEXT: s_add_i32 s41, s17, 0x3000000 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 42 +; SI-NEXT: v_readlane_b32 s18, v42, 38 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s87, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 39 +; SI-NEXT: s_and_b32 s18, s87, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 33 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 31 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 24 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 23 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 44 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s18, v42, 37 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s86, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 35 +; SI-NEXT: s_and_b32 s18, s86, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 27 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 22 +; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 32 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 30 +; SI-NEXT: v_readlane_b32 s18, v42, 25 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 26 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 21 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 20 +; SI-NEXT: v_readlane_b32 s18, v42, 15 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 19 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s18, v42, 12 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 13 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s56, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 10 +; SI-NEXT: v_readlane_b32 s18, v42, 6 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 7 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 9 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 4 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 5 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 60 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 61 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 59 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 55 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 57 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s18, v43, 52 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 53 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 51 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s18, v43, 48 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 49 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 4 +; SI-NEXT: v_readlane_b32 s18, v43, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 2 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s72, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 46 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 47 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s22, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 12 +; SI-NEXT: v_readlane_b32 s18, v43, 11 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 10 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 9 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 6 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s75, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s77, s75, 16 +; SI-NEXT: s_lshr_b32 s76, s73, 16 +; SI-NEXT: s_lshr_b32 s78, s63, 16 +; SI-NEXT: s_lshr_b32 s67, s61, 16 +; SI-NEXT: s_lshr_b32 s27, s59, 16 +; SI-NEXT: s_lshr_b32 s55, s57, 16 +; SI-NEXT: s_lshr_b32 s36, s47, 16 +; SI-NEXT: s_lshr_b32 s69, s45, 16 +; SI-NEXT: s_lshr_b32 s65, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s39, s13, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s51, s9, 16 +; SI-NEXT: s_lshr_b32 s53, s7, 16 +; SI-NEXT: s_lshr_b32 s70, s5, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s16, s74, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s75, 0xffff +; SI-NEXT: s_lshl_b32 s19, s77, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s73, 0xffff +; SI-NEXT: s_lshl_b32 s21, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s62, 0xffff +; SI-NEXT: s_lshl_b32 s22, s24, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s63, 0xffff +; SI-NEXT: s_lshl_b32 s23, s78, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s60, 0xffff +; SI-NEXT: s_lshl_b32 s24, s28, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s61, 0xffff +; SI-NEXT: s_lshl_b32 s25, s67, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s58, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s59, 0xffff +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s56, 0xffff +; SI-NEXT: s_lshl_b32 s28, s94, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s57, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_lshl_b32 s46, s30, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_and_b32 s46, s47, 0xffff +; SI-NEXT: s_lshl_b32 s47, s36, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s47, s34, 16 +; SI-NEXT: s_or_b32 s44, s44, s47 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s47, s69, 16 +; SI-NEXT: s_or_b32 s45, s45, s47 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s47, s68, 16 +; SI-NEXT: s_or_b32 s42, s42, s47 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s47, s65, 16 +; SI-NEXT: s_or_b32 s43, s43, s47 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s47, s38, 16 +; SI-NEXT: s_or_b32 s40, s40, s47 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s47, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s47 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s47, s48, 16 +; SI-NEXT: s_or_b32 s14, s14, s47 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s47, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s47 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s47, s50, 16 +; SI-NEXT: s_or_b32 s12, s12, s47 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s47, s39, 16 +; SI-NEXT: s_or_b32 s13, s13, s47 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s47, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s47 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s47, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s47 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s47, s54, 16 +; SI-NEXT: s_or_b32 s8, s8, s47 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s47, s51, 16 +; SI-NEXT: s_or_b32 s9, s9, s47 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s47, s64, 16 +; SI-NEXT: s_or_b32 s6, s6, s47 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s47, s53, 16 +; SI-NEXT: s_or_b32 s7, s7, s47 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s47, s66, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s17, s70, 16 +; SI-NEXT: s_or_b32 s4, s4, s47 +; SI-NEXT: s_or_b32 s5, s5, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: v_mov_b32_e32 v3, s20 +; SI-NEXT: v_mov_b32_e32 v4, s21 +; SI-NEXT: v_mov_b32_e32 v5, s22 +; SI-NEXT: v_mov_b32_e32 v6, s23 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: v_mov_b32_e32 v14, s44 +; SI-NEXT: v_mov_b32_e32 v15, s45 +; SI-NEXT: v_mov_b32_e32 v16, s42 +; SI-NEXT: v_mov_b32_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v18, s40 +; SI-NEXT: v_mov_b32_e32 v19, s41 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s99, v41, 35 +; SI-NEXT: v_readlane_b32 s98, v41, 34 +; SI-NEXT: v_readlane_b32 s97, v41, 33 +; SI-NEXT: v_readlane_b32 s96, v41, 32 +; SI-NEXT: v_readlane_b32 s87, v41, 31 +; SI-NEXT: v_readlane_b32 s86, v41, 30 +; SI-NEXT: v_readlane_b32 s85, v41, 29 +; SI-NEXT: v_readlane_b32 s84, v41, 28 +; SI-NEXT: v_readlane_b32 s83, v41, 27 +; SI-NEXT: v_readlane_b32 s82, v41, 26 +; SI-NEXT: v_readlane_b32 s81, v41, 25 +; SI-NEXT: v_readlane_b32 s80, v41, 24 +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: @@ -180887,853 +176880,708 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v31 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v13 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_mov_b32_e32 v14, v15 -; SI-NEXT: v_mov_b32_e32 v15, v18 -; SI-NEXT: v_mov_b32_e32 v18, v8 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v44, v12, v5 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_or_b32_e32 v41, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v52, v2, v31 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v31, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v17, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v5, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v5, v31 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_or_b32_e32 v36, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v35, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_or_b32_e32 v33, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v2, v52, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v31, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v62, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v60, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v50, v51, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v58, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v56, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v46, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v49, v48, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v44, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v41, v5, v7 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_or_b32_e32 v5, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v54, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v17, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v17, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_or_b32_e32 v25, v58, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; SI-NEXT: v_or_b32_e32 v26, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_or_b32_e32 v23, v61, v5 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v24, v60, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v21, v1, v5 -; SI-NEXT: v_mov_b32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v55, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v19, v40, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_or_b32_e32 v20, v2, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_or_b32_e32 v16, v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_or_b32_e32 v17, v42, v1 -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v3, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v41 +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v56, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v46, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v52 +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v50 +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v39 +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v38 +; SI-NEXT: v_alignbit_b32 v1, v54, v5, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v43, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v3, v12, 8, 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v3, v13, 8, 8 -; SI-NEXT: v_mov_b32_e32 v13, v46 -; SI-NEXT: v_mov_b32_e32 v46, v1 -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v53, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, v18 -; SI-NEXT: v_mov_b32_e32 v18, v15 -; SI-NEXT: v_mov_b32_e32 v15, v14 -; SI-NEXT: v_mov_b32_e32 v14, v45 -; SI-NEXT: v_mov_b32_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v10, v47 +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -181769,342 +177617,344 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_alignbit_b32 v2, v17, v16, 16 +; SI-NEXT: v_mov_b32_e32 v45, v17 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: v_mov_b32_e32 v55, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_bfe_u32 v6, v4, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v2 -; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB94_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v42, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v54, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v17, v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_alignbit_b32 v46, v17, v16, 24 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_alignbit_b32 v47, v17, v16, 16 -; SI-NEXT: v_or_b32_e32 v19, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v20, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v43, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v41, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v21, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v44, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v23, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v24, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v46, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v25, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_or_b32_e32 v26, v2, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v56, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_or_b32_e32 v27, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_or_b32_e32 v22, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_or_b32_e32 v58, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v59, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v60, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v31, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v30, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v61, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_or_b32_e32 v32, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_or_b32_e32 v62, v1, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v33, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_or_b32_e32 v63, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v34, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_or_b32_e32 v31, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v32, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v33, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v36, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_or_b32_e32 v34, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_or_b32_e32 v38, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_or_b32_e32 v48, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_or_b32_e32 v37, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_or_b32_e32 v36, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_or_b32_e32 v38, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v49, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v39, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v51, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v48, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v50, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v52, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v49, v1, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v54, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v51, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v53, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v50, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -182114,275 +177964,292 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v44, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_or_b32_e32 v41, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v52, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v50, v51, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v31, v63, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v62, v61, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v60, v59, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v56, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v46, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v45, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v43, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v54, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v53, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v14, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v18, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 -; SI-NEXT: v_mov_b32_e32 v6, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v17, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v19, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v21, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v23, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v25, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v27, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v12, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v6, v6, 8, 8 ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182393,14 +178260,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182411,14 +178278,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182429,14 +178296,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182447,14 +178314,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182463,14 +178330,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182481,32 +178348,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182517,30 +178382,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182551,14 +178418,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182567,14 +178434,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182585,30 +178452,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182619,30 +178488,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182653,30 +178524,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182687,30 +178560,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182721,33 +178598,37 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -182755,30 +178636,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182789,14 +178674,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182807,14 +178692,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182825,30 +178710,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -182859,47 +178746,55 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v46 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -186165,1805 +182060,1712 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v50 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB95_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: s_or_b32 s44, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v28 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: s_or_b32 s45, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: v_writelane_b32 v62, s5, 5 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: v_writelane_b32 v62, s5, 3 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: v_readfirstlane_b32 s4, v60 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v21 -; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v44 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_or_b32 s43, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_writelane_b32 v62, s5, 11 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_writelane_b32 v62, s5, 9 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 -; SI-NEXT: s_or_b32 s40, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: s_or_b32 s41, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s5, 17 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v48 -; SI-NEXT: s_or_b32 s28, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: s_or_b32 s29, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s5, 23 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 20 -; SI-NEXT: v_writelane_b32 v62, s5, 21 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 18 -; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v49 -; SI-NEXT: s_or_b32 s26, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v45 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s27, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 24 -; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v40 -; SI-NEXT: s_or_b32 s24, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: v_mov_b32_e32 v13, v9 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s25, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v56 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_or_b32 s22, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v50 -; SI-NEXT: s_or_b32 s23, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: v_readfirstlane_b32 s4, v58 -; SI-NEXT: v_mov_b32_e32 v9, v51 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v9 -; SI-NEXT: s_or_b32 s20, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 -; SI-NEXT: s_or_b32 s21, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v35 -; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v31 -; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v26 -; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v32 -; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 -; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: v_mov_b32_e32 v51, v15 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v51 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v33 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v43 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v42 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v39 -; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v55 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v46 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v27 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s46, v1 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v2, v40 -; SI-NEXT: v_mov_b32_e32 v56, v23 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_bfe_u32 v35, v28, 8, 8 -; SI-NEXT: v_mov_b32_e32 v50, v44 -; SI-NEXT: v_bfe_u32 v32, v44, 8, 8 -; SI-NEXT: v_mov_b32_e32 v44, v54 -; SI-NEXT: v_bfe_u32 v31, v54, 8, 8 -; SI-NEXT: v_mov_b32_e32 v54, v52 -; SI-NEXT: v_bfe_u32 v29, v52, 8, 8 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_bfe_u32 v49, v45, 8, 8 -; SI-NEXT: v_bfe_u32 v25, v12, 8, 8 -; SI-NEXT: v_mov_b32_e32 v12, v11 -; SI-NEXT: v_bfe_u32 v21, v11, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v10, 8, 8 -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_bfe_u32 v0, v8, 8, 8 -; SI-NEXT: v_mov_b32_e32 v45, v36 -; SI-NEXT: v_bfe_u32 v19, v36, 8, 8 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_bfe_u32 v40, v7, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s5, s46, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_mov_b32_e32 v7, v43 -; SI-NEXT: v_bfe_u32 v61, v43, 8, 8 -; SI-NEXT: v_mov_b32_e32 v43, v3 -; SI-NEXT: v_bfe_u32 v60, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v41, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v53, 8, 8 -; SI-NEXT: v_bfe_u32 v42, v14, 8, 8 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v20, v53 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_mov_b32_e32 v39, v9 -; SI-NEXT: v_mov_b32_e32 v18, v59 -; SI-NEXT: s_branch .LBB95_3 -; SI-NEXT: .LBB95_2: -; SI-NEXT: v_mov_b32_e32 v6, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: v_writelane_b32 v62, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v23 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: v_writelane_b32 v62, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v50, v44 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v44, v54 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_writelane_b32 v62, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v54, v52 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_writelane_b32 v62, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v52, v45 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v11 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s5, 17 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v45, v36 -; SI-NEXT: v_writelane_b32 v62, s4, 18 -; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_writelane_b32 v62, s4, 20 -; SI-NEXT: v_writelane_b32 v62, s5, 21 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v7, v43 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s5, 23 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v43, v3 -; SI-NEXT: v_writelane_b32 v62, s4, 24 -; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v40 -; SI-NEXT: v_writelane_b32 v62, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v14, v9 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v20, v53 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v18, v59 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_writelane_b32 v62, s81, 49 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v13, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, vcc -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v10 -; SI-NEXT: v_mov_b32_e32 v8, v12 -; SI-NEXT: s_cbranch_vccnz .LBB95_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_mov_b32_e32 v16, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v34, s30, 0 +; SI-NEXT: v_writelane_b32 v34, s31, 1 +; SI-NEXT: v_writelane_b32 v34, s34, 2 +; SI-NEXT: v_writelane_b32 v34, s35, 3 +; SI-NEXT: v_writelane_b32 v34, s36, 4 +; SI-NEXT: v_writelane_b32 v34, s37, 5 +; SI-NEXT: v_writelane_b32 v34, s38, 6 +; SI-NEXT: v_writelane_b32 v34, s39, 7 +; SI-NEXT: v_writelane_b32 v34, s48, 8 +; SI-NEXT: v_writelane_b32 v34, s49, 9 +; SI-NEXT: v_writelane_b32 v34, s50, 10 +; SI-NEXT: v_writelane_b32 v34, s51, 11 +; SI-NEXT: v_writelane_b32 v34, s52, 12 +; SI-NEXT: v_writelane_b32 v34, s53, 13 +; SI-NEXT: v_writelane_b32 v34, s54, 14 +; SI-NEXT: v_writelane_b32 v34, s55, 15 +; SI-NEXT: v_writelane_b32 v34, s64, 16 +; SI-NEXT: v_writelane_b32 v34, s65, 17 +; SI-NEXT: v_writelane_b32 v34, s66, 18 +; SI-NEXT: v_writelane_b32 v34, s67, 19 +; SI-NEXT: v_writelane_b32 v34, s68, 20 +; SI-NEXT: v_writelane_b32 v34, s69, 21 +; SI-NEXT: v_writelane_b32 v34, s70, 22 +; SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v34, s71, 23 +; SI-NEXT: s_lshr_b32 s7, s22, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v37, s6, 0 +; SI-NEXT: v_writelane_b32 v34, s80, 24 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_writelane_b32 v37, s7, 2 +; SI-NEXT: v_writelane_b32 v34, s81, 25 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: v_writelane_b32 v37, s8, 4 +; SI-NEXT: v_writelane_b32 v34, s82, 26 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_writelane_b32 v37, s9, 6 +; SI-NEXT: v_writelane_b32 v34, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s11, v20 +; SI-NEXT: v_writelane_b32 v37, s10, 8 +; SI-NEXT: v_writelane_b32 v34, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_writelane_b32 v37, s11, 10 +; SI-NEXT: v_writelane_b32 v34, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_writelane_b32 v37, s12, 11 +; SI-NEXT: v_writelane_b32 v34, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s71, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_writelane_b32 v37, s13, 12 +; SI-NEXT: v_writelane_b32 v34, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_writelane_b32 v37, s14, 13 +; SI-NEXT: v_writelane_b32 v34, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_writelane_b32 v37, s15, 14 +; SI-NEXT: v_writelane_b32 v34, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s34, v12 +; SI-NEXT: v_writelane_b32 v37, s93, 15 +; SI-NEXT: v_writelane_b32 v34, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s38, v11 +; SI-NEXT: v_writelane_b32 v37, s34, 16 +; SI-NEXT: v_writelane_b32 v34, s99, 35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b32 s90, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s88, s25, 16 +; SI-NEXT: s_lshr_b32 s79, s23, 16 +; SI-NEXT: s_lshr_b32 s78, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s19, 16 +; SI-NEXT: s_lshr_b32 s99, s18, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b32 s97, s16, 16 +; SI-NEXT: v_writelane_b32 v37, s38, 17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v7 +; SI-NEXT: v_writelane_b32 v37, s18, 18 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 19 +; SI-NEXT: v_readfirstlane_b32 s94, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_readfirstlane_b32 s64, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_readfirstlane_b32 s50, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s53, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s49, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_readfirstlane_b32 s37, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s51, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s81, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_writelane_b32 v37, s21, 20 +; SI-NEXT: v_readfirstlane_b32 s35, v2 +; SI-NEXT: v_readfirstlane_b32 s39, v1 +; SI-NEXT: v_readfirstlane_b32 s95, v18 +; SI-NEXT: v_readfirstlane_b32 s68, v17 +; SI-NEXT: v_readfirstlane_b32 s66, v16 +; SI-NEXT: v_readfirstlane_b32 s67, v15 +; SI-NEXT: v_readfirstlane_b32 s55, v14 +; SI-NEXT: v_readfirstlane_b32 s65, v13 +; SI-NEXT: v_readfirstlane_b32 s52, v12 +; SI-NEXT: v_readfirstlane_b32 s54, v11 +; SI-NEXT: v_readfirstlane_b32 s48, v10 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v9 +; SI-NEXT: v_readfirstlane_b32 s36, v8 +; SI-NEXT: v_readfirstlane_b32 s31, v6 +; SI-NEXT: v_readfirstlane_b32 s98, v5 +; SI-NEXT: v_readfirstlane_b32 s92, v19 +; SI-NEXT: v_readfirstlane_b32 s91, v3 +; SI-NEXT: v_writelane_b32 v37, s20, 21 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 22 +; SI-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB95_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s62, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s39, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s96, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_hi, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s34, 0xffff +; SI-NEXT: s_lshl_b32 s5, s52, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s49, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_lshr_b32 s93, s75, 8 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s53, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: v_writelane_b32 v35, s93, 1 +; SI-NEXT: s_lshr_b32 s93, s73, 8 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: v_writelane_b32 v35, s93, 0 +; SI-NEXT: s_lshr_b32 s93, s61, 8 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s64, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: v_writelane_b32 v36, s93, 63 +; SI-NEXT: s_lshr_b32 s93, s57, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xffff +; SI-NEXT: s_lshl_b32 vcc_lo, s95, 16 +; SI-NEXT: v_writelane_b32 v36, s93, 62 +; SI-NEXT: s_lshr_b32 s93, s63, 8 +; SI-NEXT: s_or_b32 s5, s5, vcc_lo +; SI-NEXT: v_writelane_b32 v36, s93, 61 +; SI-NEXT: s_lshr_b32 vcc_lo, s59, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 60 +; SI-NEXT: s_lshr_b32 vcc_lo, s47, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 59 +; SI-NEXT: s_lshr_b32 vcc_lo, s45, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 58 +; SI-NEXT: s_lshr_b32 vcc_lo, s43, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 57 +; SI-NEXT: s_lshr_b32 vcc_lo, s41, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 56 +; SI-NEXT: s_lshr_b32 vcc_lo, s15, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 55 +; SI-NEXT: s_lshr_b32 vcc_lo, s13, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 54 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 53 +; SI-NEXT: s_lshr_b32 vcc_lo, s9, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 52 +; SI-NEXT: s_lshr_b32 vcc_lo, s7, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 51 +; SI-NEXT: s_lshr_b32 vcc_lo, s5, 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 50 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 27 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 28 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 25 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 26 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 23 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 24 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 33 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 34 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 31 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 32 +; SI-NEXT: s_lshr_b64 vcc, s[72:73], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 29 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 30 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 39 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 40 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 37 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 38 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 35 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 36 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 45 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 46 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 43 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 44 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 41 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 42 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 51 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 52 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 49 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 50 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 47 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 48 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 24 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 57 +; SI-NEXT: s_bfe_u32 s93, s78, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 58 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 16 +; SI-NEXT: v_writelane_b32 v36, s93, 49 +; SI-NEXT: s_bfe_u32 s93, s79, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 55 +; SI-NEXT: v_writelane_b32 v36, s93, 48 +; SI-NEXT: s_bfe_u32 s93, s88, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 56 +; SI-NEXT: s_lshr_b64 vcc, s[58:59], 8 +; SI-NEXT: v_writelane_b32 v36, s93, 47 +; SI-NEXT: s_bfe_u32 s93, s89, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 53 +; SI-NEXT: v_writelane_b32 v36, s93, 46 +; SI-NEXT: s_bfe_u32 s93, s90, 0x80008 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 54 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 +; SI-NEXT: v_writelane_b32 v36, s93, 45 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 63 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 0 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 61 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 62 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 +; SI-NEXT: v_writelane_b32 v37, vcc_lo, 59 +; SI-NEXT: v_writelane_b32 v37, vcc_hi, 60 +; SI-NEXT: s_lshr_b64 vcc, s[44:45], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 5 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 6 +; SI-NEXT: s_mov_b32 vcc_lo, s97 +; SI-NEXT: s_mov_b32 vcc_hi, s96 +; SI-NEXT: s_lshr_b64 s[96:97], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v36, s96, 3 +; SI-NEXT: v_writelane_b32 v36, s97, 4 +; SI-NEXT: s_mov_b32 s96, vcc_hi +; SI-NEXT: s_mov_b32 s97, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s99 +; SI-NEXT: s_mov_b32 vcc_hi, s98 +; SI-NEXT: s_lshr_b64 s[98:99], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v36, s98, 1 +; SI-NEXT: v_writelane_b32 v36, s99, 2 +; SI-NEXT: s_mov_b32 s98, vcc_hi +; SI-NEXT: s_mov_b32 s99, vcc_lo +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 11 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 12 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 9 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 10 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 7 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 8 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 17 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 18 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 15 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 16 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 13 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 14 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 23 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 24 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 21 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 22 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 19 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 20 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 29 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 30 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 27 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 28 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 25 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 26 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 35 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 36 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 33 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 34 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 31 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 32 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 41 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 42 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 39 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 40 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 37 +; SI-NEXT: s_mov_b32 s30, s53 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 38 +; SI-NEXT: s_mov_b32 vcc_lo, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[6:7], 24 +; SI-NEXT: s_mov_b32 s51, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s81 +; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 16 +; SI-NEXT: s_mov_b32 s81, vcc_lo +; SI-NEXT: s_lshr_b64 vcc, s[6:7], 8 +; SI-NEXT: v_writelane_b32 v36, vcc_lo, 43 +; SI-NEXT: v_writelane_b32 v36, vcc_hi, 44 +; SI-NEXT: s_mov_b32 vcc_lo, s35 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 24 +; SI-NEXT: s_mov_b32 s35, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s71 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 +; SI-NEXT: s_mov_b32 s71, vcc_lo +; SI-NEXT: s_mov_b32 vcc_lo, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 +; SI-NEXT: s_mov_b32 s20, s37 +; SI-NEXT: s_mov_b32 s37, s49 +; SI-NEXT: s_bfe_u32 s18, s76, 0x80008 +; SI-NEXT: s_bfe_u32 s21, s77, 0x80008 +; SI-NEXT: s_bfe_u32 s93, s91, 0x80008 +; SI-NEXT: s_bfe_u32 s49, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s82, s31, 0x80008 +; SI-NEXT: s_bfe_u32 s83, s36, 0x80008 +; SI-NEXT: s_bfe_u32 s84, s48, 0x80008 +; SI-NEXT: s_bfe_u32 s85, s52, 0x80008 +; SI-NEXT: s_bfe_u32 s69, s55, 0x80008 +; SI-NEXT: s_bfe_u32 s86, s66, 0x80008 +; SI-NEXT: s_bfe_u32 s87, s95, 0x80008 +; SI-NEXT: s_mov_b32 s39, vcc_lo +; SI-NEXT: s_cbranch_execnz .LBB95_4 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, s64 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s68 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_readlane_b32 s11, v37, 17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s67 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s30 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_bfe_u32 v42, v15, 8, 8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s66 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s53 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s65 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s7, v20 +; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_bfe_u32 v48, v20, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s37 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_readfirstlane_b32 s9, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_bfe_u32 v41, v53, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s52 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_readlane_b32 s12, v37, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_readlane_b32 s12, v37, 22 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s11, v43 +; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_bfe_u32 v60, v43, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s96 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_readlane_b32 s14, v37, 15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_readlane_b32 s14, v37, 19 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_readlane_b32 s15, v37, 13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s13, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_bfe_u32 v61, v5, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_readlane_b32 s40, v37, 12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s98 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s15, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_bfe_u32 v40, v1, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s17, v6 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_mov_b32_e32 v45, v6 -; SI-NEXT: v_bfe_u32 v19, v6, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s19, v7 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s21, v4 -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s21, s22, s21 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s23, v8 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: v_bfe_u32 v21, v8, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: s_or_b32 s23, s24, s23 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s25, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_bfe_u32 v25, v2, 8, 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: s_or_b32 s25, s26, s25 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v52 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: v_bfe_u32 v49, v52, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_bfe_u32 v16, v4, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s29, v54 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: v_bfe_u32 v29, v54, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: s_or_b32 s29, s40, s29 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s81 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_readlane_b32 s42, v37, 14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s51 +; SI-NEXT: v_readfirstlane_b32 s41, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_readlane_b32 s43, v37, 11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_readfirstlane_b32 s42, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s41, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 ; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s41, v44 +; SI-NEXT: v_readfirstlane_b32 s41, v7 ; SI-NEXT: s_lshl_b32 s41, s41, 16 -; SI-NEXT: v_bfe_u32 v31, v44, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_readlane_b32 s44, v37, 10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s71 +; SI-NEXT: v_readfirstlane_b32 s43, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s43, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 ; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s43, v50 +; SI-NEXT: v_readfirstlane_b32 s43, v8 ; SI-NEXT: s_lshl_b32 s43, s43, 16 -; SI-NEXT: v_bfe_u32 v32, v50, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s39 ; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s44, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s91 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_readlane_b32 s46, v37, 8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s35 +; SI-NEXT: v_readfirstlane_b32 s45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s45, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_readfirstlane_b32 s46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s90 ; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s45, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_readfirstlane_b32 s45, v9 ; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: v_bfe_u32 v35, v47, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s46, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_readlane_b32 s29, v37, 6 ; SI-NEXT: s_or_b32 s45, s46, s45 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_writelane_b32 v62, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 2 -; SI-NEXT: v_writelane_b32 v62, s47, 3 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 0 -; SI-NEXT: v_writelane_b32 v62, s47, 1 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 10 -; SI-NEXT: v_writelane_b32 v62, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 8 -; SI-NEXT: v_writelane_b32 v62, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 6 -; SI-NEXT: v_writelane_b32 v62, s47, 7 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 16 -; SI-NEXT: v_writelane_b32 v62, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 14 -; SI-NEXT: v_writelane_b32 v62, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 12 -; SI-NEXT: v_writelane_b32 v62, s47, 13 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 22 -; SI-NEXT: v_writelane_b32 v62, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 20 -; SI-NEXT: v_writelane_b32 v62, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 18 -; SI-NEXT: v_writelane_b32 v62, s47, 19 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 28 -; SI-NEXT: v_writelane_b32 v62, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 26 -; SI-NEXT: v_writelane_b32 v62, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 24 -; SI-NEXT: v_writelane_b32 v62, s47, 25 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 34 -; SI-NEXT: v_writelane_b32 v62, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 32 -; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 30 -; SI-NEXT: v_writelane_b32 v62, s47, 31 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 40 -; SI-NEXT: v_writelane_b32 v62, s47, 41 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 38 -; SI-NEXT: v_writelane_b32 v62, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 36 -; SI-NEXT: v_writelane_b32 v62, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 44 -; SI-NEXT: v_writelane_b32 v62, s47, 45 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 42 -; SI-NEXT: v_writelane_b32 v62, s47, 43 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: v_bfe_u32 v0, v7, 8, 8 -; SI-NEXT: .LBB95_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 -; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 -; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 -; SI-NEXT: s_lshl_b32 s57, vcc_lo, 24 -; SI-NEXT: s_lshl_b32 s47, s47, 16 -; SI-NEXT: s_or_b32 s47, s57, s47 -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: v_mov_b32_e32 v13, s44 -; SI-NEXT: s_and_b32 s44, s45, 0xff -; SI-NEXT: s_lshl_b32 s45, s71, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 -; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v35 -; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v14, s44, v14 -; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: v_readlane_b32 s45, v62, 7 -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 -; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_readlane_b32 s44, v62, 8 -; SI-NEXT: v_readlane_b32 s45, v62, 9 -; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 -; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 -; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_mov_b32_e32 v15, s42 -; SI-NEXT: s_and_b32 s42, s43, 0xff -; SI-NEXT: s_lshl_b32 s43, s70, 8 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v50 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v17, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v32 -; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v18, s42, v18 -; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: v_readlane_b32 s43, v62, 13 -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_readlane_b32 s42, v62, 14 -; SI-NEXT: v_readlane_b32 s43, v62, 15 -; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s44, v62, 16 -; SI-NEXT: s_lshl_b32 s43, s44, 24 -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_mov_b32_e32 v20, s40 -; SI-NEXT: s_and_b32 s40, s41, 0xff -; SI-NEXT: s_lshl_b32 s41, s69, 8 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v44 -; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v31 -; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v22, s40, v22 -; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: v_readlane_b32 s41, v62, 19 -; SI-NEXT: s_lshl_b32 s40, s40, 8 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_readlane_b32 s40, v62, 20 -; SI-NEXT: v_readlane_b32 s41, v62, 21 -; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 22 -; SI-NEXT: s_lshl_b32 s41, s42, 24 -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: s_and_b32 s28, s29, 0xff -; SI-NEXT: s_lshl_b32 s29, s68, 8 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v54 -; SI-NEXT: s_or_b32 s28, s28, s29 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v29 -; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_or_b32_e32 v26, s28, v26 -; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: v_readlane_b32 s29, v62, 25 -; SI-NEXT: s_lshl_b32 s28, s28, 8 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_readlane_b32 s28, v62, 26 -; SI-NEXT: v_readlane_b32 s29, v62, 27 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s40, v62, 28 -; SI-NEXT: s_lshl_b32 s29, s40, 24 +; SI-NEXT: v_readfirstlane_b32 s46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s29, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_mov_b32_e32 v28, s26 -; SI-NEXT: s_and_b32 s26, s27, 0xff -; SI-NEXT: s_lshl_b32 s27, s66, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v52 -; SI-NEXT: s_or_b32 s26, s26, s27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 -; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v12, s26, v12 -; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: v_readlane_b32 s27, v62, 31 -; SI-NEXT: s_lshl_b32 s26, s26, 8 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_readlane_b32 s27, v62, 33 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s28, v62, 34 -; SI-NEXT: s_lshl_b32 s27, s28, 24 +; SI-NEXT: v_readfirstlane_b32 s26, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s89 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: s_or_b32 s46, s46, s28 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_readfirstlane_b32 s28, v10 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: v_readlane_b32 s27, v37, 4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s47, s29, s28 +; SI-NEXT: v_readfirstlane_b32 s28, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_readfirstlane_b32 s27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v24 -; SI-NEXT: v_mov_b32_e32 v27, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xff -; SI-NEXT: s_lshl_b32 s25, s64, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v24 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v24 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: buffer_store_dword v18, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v24 -; SI-NEXT: v_or_b32_e32 v11, s24, v11 -; SI-NEXT: buffer_store_dword v20, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v24 -; SI-NEXT: v_readlane_b32 s24, v62, 36 -; SI-NEXT: buffer_store_dword v22, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v24 -; SI-NEXT: v_readlane_b32 s25, v62, 37 -; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: buffer_store_dword v23, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v24 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_readlane_b32 s24, v62, 38 -; SI-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v24 -; SI-NEXT: v_readlane_b32 s25, v62, 39 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 s26, v62, 40 -; SI-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v24 +; SI-NEXT: s_or_b32 s58, s28, s26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s59, s27, s26 +; SI-NEXT: v_readfirstlane_b32 s26, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_readlane_b32 s25, v37, 2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s62, s26, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_lshl_b32 s25, s26, 24 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_or_b32 s63, s25, s24 +; SI-NEXT: v_readfirstlane_b32 s24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s79 +; SI-NEXT: v_readlane_b32 s23, v37, 0 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_readlane_b32 s18, v37, 21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s56, s24, s22 +; SI-NEXT: v_readfirstlane_b32 s22, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readlane_b32 s18, v37, 20 +; SI-NEXT: s_or_b32 s57, s23, s22 +; SI-NEXT: v_readfirstlane_b32 s22, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s78 +; SI-NEXT: v_readlane_b32 s18, v37, 18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s99 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s60, s22, s20 +; SI-NEXT: v_readfirstlane_b32 s20, v20 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: s_or_b32 s61, s21, s20 +; SI-NEXT: v_readfirstlane_b32 s20, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_or_b32 s72, s20, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v23 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: s_or_b32 s73, s16, s18 +; SI-NEXT: s_lshr_b64 s[18:19], s[72:73], 16 +; SI-NEXT: v_writelane_b32 v37, s18, 31 +; SI-NEXT: v_writelane_b32 v37, s19, 32 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v37, s18, 63 +; SI-NEXT: v_writelane_b32 v36, s19, 0 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v37, s18, 61 +; SI-NEXT: v_writelane_b32 v37, s19, 62 +; SI-NEXT: s_lshr_b64 s[18:19], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v37, s18, 59 +; SI-NEXT: v_writelane_b32 v37, s19, 60 +; SI-NEXT: s_lshr_b64 s[18:19], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v36, s18, 23 +; SI-NEXT: v_writelane_b32 v36, s19, 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v36, s18, 29 +; SI-NEXT: v_writelane_b32 v36, s19, 30 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v36, s18, 27 +; SI-NEXT: v_writelane_b32 v36, s19, 28 +; SI-NEXT: s_lshr_b64 s[18:19], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v36, s18, 25 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s97 +; SI-NEXT: v_writelane_b32 v36, s19, 26 +; SI-NEXT: s_lshr_b64 s[70:71], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v36, s70, 33 +; SI-NEXT: v_writelane_b32 v36, s71, 34 +; SI-NEXT: s_lshr_b64 s[70:71], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v36, s70, 31 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_writelane_b32 v36, s71, 32 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_writelane_b32 v36, s70, 41 +; SI-NEXT: v_writelane_b32 v36, s71, 42 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v36, s70, 39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v15 +; SI-NEXT: v_writelane_b32 v36, s71, 40 +; SI-NEXT: s_lshr_b64 s[70:71], s[8:9], 8 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s76 +; SI-NEXT: v_writelane_b32 v36, s70, 37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 +; SI-NEXT: v_writelane_b32 v36, s71, 38 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v37, s70, 6 +; SI-NEXT: v_writelane_b32 v37, s71, 7 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 16 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v14 +; SI-NEXT: v_writelane_b32 v37, s70, 8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v15 +; SI-NEXT: v_writelane_b32 v37, s71, 9 +; SI-NEXT: s_lshr_b64 s[70:71], s[6:7], 8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_writelane_b32 v36, s70, 43 +; SI-NEXT: v_writelane_b32 v36, s71, 44 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v37, s70, 0 +; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v26 +; SI-NEXT: v_writelane_b32 v37, s71, 1 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: v_writelane_b32 v37, s70, 4 +; SI-NEXT: s_or_b32 s75, s17, s16 +; SI-NEXT: v_writelane_b32 v37, s71, 5 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 8 +; SI-NEXT: s_lshr_b64 vcc, s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[72:73], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[58:59], 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v37, s70, 2 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[62:63], 8 +; SI-NEXT: s_lshr_b32 s21, s75, 8 +; SI-NEXT: s_lshr_b32 s95, s73, 8 +; SI-NEXT: s_lshr_b32 s91, s61, 8 +; SI-NEXT: s_lshr_b32 s79, s57, 8 +; SI-NEXT: s_lshr_b32 s77, s63, 8 +; SI-NEXT: s_lshr_b32 vcc_hi, s59, 8 +; SI-NEXT: s_lshr_b32 s51, s47, 8 +; SI-NEXT: s_lshr_b32 s39, s45, 8 +; SI-NEXT: s_lshr_b32 s37, s43, 8 +; SI-NEXT: s_lshr_b32 s35, s41, 8 +; SI-NEXT: s_lshr_b32 s29, s15, 8 +; SI-NEXT: s_lshr_b32 s28, s13, 8 +; SI-NEXT: s_lshr_b32 s27, s11, 8 +; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s93, s7, 8 +; SI-NEXT: s_lshr_b32 s25, s5, 8 +; SI-NEXT: v_bfe_u32 v32, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v30, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v28, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v27, v11, 8, 8 +; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v22, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v21, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v17, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v14, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v12, v1, 8, 8 +; SI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v37, s71, 3 +; SI-NEXT: s_mov_b32 s70, s18 +; SI-NEXT: s_branch .LBB95_5 +; SI-NEXT: .LBB95_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s30, s53 +; SI-NEXT: v_writelane_b32 v37, s4, 23 +; SI-NEXT: v_writelane_b32 v37, s5, 24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: v_writelane_b32 v37, s4, 25 +; SI-NEXT: v_writelane_b32 v37, s5, 26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s20, s37 +; SI-NEXT: v_writelane_b32 v37, s4, 27 +; SI-NEXT: v_writelane_b32 v37, s5, 28 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s37, s49 +; SI-NEXT: v_writelane_b32 v37, s4, 29 +; SI-NEXT: v_writelane_b32 v37, s5, 30 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 31 +; SI-NEXT: v_writelane_b32 v37, s5, 32 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: v_writelane_b32 v37, s4, 33 +; SI-NEXT: v_writelane_b32 v37, s5, 34 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 35 +; SI-NEXT: v_writelane_b32 v37, s5, 36 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 37 +; SI-NEXT: v_writelane_b32 v37, s5, 38 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 39 +; SI-NEXT: v_writelane_b32 v37, s5, 40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 41 +; SI-NEXT: v_writelane_b32 v37, s5, 42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 43 +; SI-NEXT: v_writelane_b32 v37, s5, 44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 45 +; SI-NEXT: v_writelane_b32 v37, s5, 46 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 47 +; SI-NEXT: v_writelane_b32 v37, s5, 48 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 49 +; SI-NEXT: v_writelane_b32 v37, s5, 50 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 51 +; SI-NEXT: v_writelane_b32 v37, s5, 52 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 53 +; SI-NEXT: v_writelane_b32 v37, s5, 54 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 55 +; SI-NEXT: v_writelane_b32 v37, s5, 56 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 57 +; SI-NEXT: v_writelane_b32 v37, s5, 58 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 59 +; SI-NEXT: v_writelane_b32 v37, s5, 60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 61 +; SI-NEXT: v_writelane_b32 v37, s5, 62 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v37, s4, 63 +; SI-NEXT: v_writelane_b32 v36, s5, 0 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 1 +; SI-NEXT: v_writelane_b32 v36, s5, 2 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 3 +; SI-NEXT: v_writelane_b32 v36, s5, 4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 5 +; SI-NEXT: v_writelane_b32 v36, s5, 6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 7 +; SI-NEXT: v_writelane_b32 v36, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 9 +; SI-NEXT: v_writelane_b32 v36, s5, 10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 11 +; SI-NEXT: v_writelane_b32 v36, s5, 12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 13 +; SI-NEXT: v_writelane_b32 v36, s5, 14 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 15 +; SI-NEXT: v_writelane_b32 v36, s5, 16 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 17 +; SI-NEXT: v_writelane_b32 v36, s5, 18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 19 +; SI-NEXT: v_writelane_b32 v36, s5, 20 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 21 +; SI-NEXT: v_writelane_b32 v36, s5, 22 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 23 +; SI-NEXT: v_writelane_b32 v36, s5, 24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 25 +; SI-NEXT: v_writelane_b32 v36, s5, 26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 27 +; SI-NEXT: v_writelane_b32 v36, s5, 28 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 29 +; SI-NEXT: v_writelane_b32 v36, s5, 30 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 31 +; SI-NEXT: v_writelane_b32 v36, s5, 32 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 33 +; SI-NEXT: v_writelane_b32 v36, s5, 34 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 35 +; SI-NEXT: v_writelane_b32 v36, s5, 36 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 37 +; SI-NEXT: v_writelane_b32 v36, s5, 38 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 39 +; SI-NEXT: v_writelane_b32 v36, s5, 40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 41 +; SI-NEXT: v_writelane_b32 v36, s5, 42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v36, s4, 43 +; SI-NEXT: v_writelane_b32 v36, s5, 44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_branch .LBB95_2 +; SI-NEXT: .LBB95_4: +; SI-NEXT: v_mov_b32_e32 v16, s79 +; SI-NEXT: v_mov_b32_e32 v20, s78 +; SI-NEXT: v_mov_b32_e32 v23, s77 +; SI-NEXT: v_mov_b32_e32 v26, s76 +; SI-NEXT: v_readlane_b32 s76, v37, 39 +; SI-NEXT: v_readlane_b32 s78, v37, 37 +; SI-NEXT: v_readlane_b32 s77, v37, 40 +; SI-NEXT: v_readlane_b32 s79, v37, 38 +; SI-NEXT: v_writelane_b32 v37, s34, 0 +; SI-NEXT: v_writelane_b32 v37, s35, 1 +; SI-NEXT: v_mov_b32_e32 v9, s91 +; SI-NEXT: v_mov_b32_e32 v10, s90 +; SI-NEXT: v_readlane_b32 s16, v36, 45 +; SI-NEXT: v_readlane_b32 s90, v37, 33 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 46 +; SI-NEXT: v_readlane_b32 s91, v37, 34 +; SI-NEXT: v_writelane_b32 v37, s70, 4 +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 47 +; SI-NEXT: v_writelane_b32 v37, s71, 5 +; SI-NEXT: v_mov_b32_e32 v1, s95 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 48 +; SI-NEXT: v_readlane_b32 s94, v37, 35 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 49 +; SI-NEXT: v_readlane_b32 s95, v37, 36 +; SI-NEXT: v_writelane_b32 v37, s38, 2 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_readlane_b32 s16, v36, 60 +; SI-NEXT: v_writelane_b32 v37, s39, 3 +; SI-NEXT: s_mov_b32 vcc_hi, s16 +; SI-NEXT: v_readlane_b32 s16, v37, 27 +; SI-NEXT: v_readlane_b32 s34, v37, 29 +; SI-NEXT: v_readlane_b32 s17, v37, 28 +; SI-NEXT: v_readlane_b32 s35, v37, 30 +; SI-NEXT: v_writelane_b32 v37, s50, 6 +; SI-NEXT: v_writelane_b32 v37, s51, 7 +; SI-NEXT: v_readlane_b32 s38, v37, 25 +; SI-NEXT: v_readlane_b32 s39, v37, 26 +; SI-NEXT: v_writelane_b32 v37, s80, 8 +; SI-NEXT: v_writelane_b32 v37, s81, 9 +; SI-NEXT: v_mov_b32_e32 v6, s36 +; SI-NEXT: v_mov_b32_e32 v31, s21 +; SI-NEXT: v_readlane_b32 s20, v36, 56 +; SI-NEXT: v_readlane_b32 s21, v36, 58 +; SI-NEXT: v_readlane_b32 s80, v36, 35 +; SI-NEXT: v_readlane_b32 s36, v37, 53 +; SI-NEXT: v_readlane_b32 s50, v37, 51 +; SI-NEXT: v_mov_b32_e32 v8, s92 +; SI-NEXT: v_mov_b32_e32 v24, s93 +; SI-NEXT: v_readlane_b32 s22, v36, 57 +; SI-NEXT: v_readlane_b32 s23, v36, 59 +; SI-NEXT: s_mov_b32 s35, s20 +; SI-NEXT: s_mov_b32 s39, s21 +; SI-NEXT: v_readlane_b32 s81, v36, 36 +; SI-NEXT: v_readlane_b32 s20, v37, 57 +; SI-NEXT: v_readlane_b32 s92, v37, 55 +; SI-NEXT: v_readlane_b32 s37, v37, 54 +; SI-NEXT: v_readlane_b32 s51, v37, 52 +; SI-NEXT: v_readlane_b32 s24, v37, 41 +; SI-NEXT: v_mov_b32_e32 v2, s66 +; SI-NEXT: v_mov_b32_e32 v3, s55 +; SI-NEXT: v_mov_b32_e32 v4, s52 +; SI-NEXT: v_mov_b32_e32 v5, s48 +; SI-NEXT: v_mov_b32_e32 v7, s31 +; SI-NEXT: v_mov_b32_e32 v11, s89 +; SI-NEXT: v_mov_b32_e32 v13, s88 +; SI-NEXT: v_mov_b32_e32 v12, s87 +; SI-NEXT: v_mov_b32_e32 v14, s86 +; SI-NEXT: v_mov_b32_e32 v15, s69 +; SI-NEXT: v_mov_b32_e32 v17, s85 +; SI-NEXT: v_mov_b32_e32 v18, s84 +; SI-NEXT: v_mov_b32_e32 v19, s83 +; SI-NEXT: v_mov_b32_e32 v21, s82 +; SI-NEXT: v_mov_b32_e32 v22, s49 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_readlane_b32 s19, v36, 50 +; SI-NEXT: v_readlane_b32 s28, v36, 51 +; SI-NEXT: v_readlane_b32 s29, v36, 54 +; SI-NEXT: v_readlane_b32 s18, v36, 55 +; SI-NEXT: s_mov_b32 vcc_lo, s16 +; SI-NEXT: v_readlane_b32 s16, v37, 23 +; SI-NEXT: s_mov_b32 s70, s80 +; SI-NEXT: v_readlane_b32 s21, v37, 58 +; SI-NEXT: v_readlane_b32 s93, v37, 56 +; SI-NEXT: v_readlane_b32 s30, v36, 21 +; SI-NEXT: s_mov_b32 s37, s22 +; SI-NEXT: v_readlane_b32 s48, v36, 19 +; SI-NEXT: s_mov_b32 s51, s23 +; SI-NEXT: v_readlane_b32 s22, v36, 17 +; SI-NEXT: v_readlane_b32 s52, v37, 49 +; SI-NEXT: v_readlane_b32 s64, v36, 15 +; SI-NEXT: v_readlane_b32 s68, v37, 47 +; SI-NEXT: v_readlane_b32 s80, v36, 13 +; SI-NEXT: v_readlane_b32 s82, v37, 45 +; SI-NEXT: v_readlane_b32 s84, v36, 11 +; SI-NEXT: v_readlane_b32 s86, v37, 43 +; SI-NEXT: v_readlane_b32 s96, v36, 9 +; SI-NEXT: v_readlane_b32 s25, v37, 42 +; SI-NEXT: v_readlane_b32 s98, v36, 7 +; SI-NEXT: v_readlane_b32 s54, v36, 3 +; SI-NEXT: v_readlane_b32 s66, v36, 5 +; SI-NEXT: v_readlane_b32 s88, v36, 1 +; SI-NEXT: v_readlane_b32 s26, v36, 52 +; SI-NEXT: v_readlane_b32 s27, v36, 53 +; SI-NEXT: v_readlane_b32 s77, v36, 61 +; SI-NEXT: v_readlane_b32 s17, v37, 24 +; SI-NEXT: v_readlane_b32 s79, v36, 62 +; SI-NEXT: v_readlane_b32 s91, v36, 63 +; SI-NEXT: v_readlane_b32 s95, v35, 0 +; SI-NEXT: v_readlane_b32 s31, v36, 22 +; SI-NEXT: v_readlane_b32 s21, v35, 1 +; SI-NEXT: v_readlane_b32 s49, v36, 20 +; SI-NEXT: v_readlane_b32 s23, v36, 18 +; SI-NEXT: v_readlane_b32 s53, v37, 50 +; SI-NEXT: v_readlane_b32 s65, v36, 16 +; SI-NEXT: v_readlane_b32 s69, v37, 48 +; SI-NEXT: v_readlane_b32 s81, v36, 14 +; SI-NEXT: v_readlane_b32 s83, v37, 46 +; SI-NEXT: v_readlane_b32 s85, v36, 12 +; SI-NEXT: v_readlane_b32 s87, v37, 44 +; SI-NEXT: v_readlane_b32 s97, v36, 10 +; SI-NEXT: s_mov_b32 s25, s19 +; SI-NEXT: s_mov_b32 s93, s28 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s18 +; SI-NEXT: v_readlane_b32 s99, v36, 8 +; SI-NEXT: v_readlane_b32 s55, v36, 4 +; SI-NEXT: v_readlane_b32 s67, v36, 6 +; SI-NEXT: v_readlane_b32 s89, v36, 2 +; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s38, 0xff +; SI-NEXT: s_lshl_b32 s18, vcc_lo, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v33, s16 +; SI-NEXT: s_and_b32 s16, s75, 0xff +; SI-NEXT: s_lshl_b32 s17, s21, 8 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v26, v32, v26 +; SI-NEXT: v_or_b32_e32 v26, s16, v26 +; SI-NEXT: s_lshl_b32 s16, s34, 8 +; SI-NEXT: s_and_b32 s17, s72, 0xff +; SI-NEXT: v_readlane_b32 s18, v37, 31 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: s_lshl_b32 s18, s90, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v32, s16 +; SI-NEXT: s_and_b32 s16, s73, 0xff +; SI-NEXT: s_lshl_b32 s17, s95, 8 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v23, s16, v23 +; SI-NEXT: s_lshl_b32 s16, s94, 8 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s78, 0xff +; SI-NEXT: s_lshl_b32 s18, s76, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: s_and_b32 s16, s61, 0xff +; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_or_b32_e32 v20, s16, v20 +; SI-NEXT: s_lshl_b32 s16, s24, 8 +; SI-NEXT: s_and_b32 s17, s56, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s18, s82, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: s_and_b32 s16, s57, 0xff +; SI-NEXT: s_lshl_b32 s17, s79, 8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v16, s16, v16 +; SI-NEXT: s_lshl_b32 s16, s68, 8 +; SI-NEXT: s_and_b32 s17, s62, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s52, 0xff +; SI-NEXT: s_lshl_b32 s18, s50, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v29, s16 +; SI-NEXT: s_and_b32 s16, s63, 0xff +; SI-NEXT: s_lshl_b32 s17, s77, 8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v13, v28, v13 +; SI-NEXT: v_or_b32_e32 v13, s16, v13 +; SI-NEXT: s_lshl_b32 s16, s36, 8 +; SI-NEXT: s_and_b32 s17, s58, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s92, 0xff +; SI-NEXT: s_lshl_b32 s18, s20, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, vcc_hi, 8 +; SI-NEXT: v_or_b32_e32 v11, v27, v11 +; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: s_and_b32 s16, s59, 0xff +; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v24 -; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: buffer_store_dword v27, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v24 -; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v26, vcc, 8, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v32, v26, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: v_readlane_b32 s19, v37, 32 +; SI-NEXT: v_or_b32_e32 v11, s16, v11 +; SI-NEXT: buffer_store_dword v31, v23, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v23, vcc, 20, v0 +; SI-NEXT: v_readlane_b32 s16, v37, 59 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s17, v37, 60 +; SI-NEXT: v_readlane_b32 s18, v37, 61 +; SI-NEXT: buffer_store_dword v30, v20, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: v_readlane_b32 s19, v37, 62 +; SI-NEXT: buffer_store_dword v16, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: s_and_b32 s22, s23, 0xff -; SI-NEXT: s_lshl_b32 s23, s54, 8 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v37, 63 +; SI-NEXT: buffer_store_dword v29, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s16, s47, 0xff +; SI-NEXT: s_lshl_b32 s17, s51, 8 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v21 -; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v10, s22, v10 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s62, 8 -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: v_readlane_b32 s22, v62, 42 -; SI-NEXT: v_readlane_b32 s23, v62, 43 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: v_readlane_b32 s24, v62, 44 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s23, s24, 24 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v24 -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: v_or_b32_e32 v10, s16, v10 +; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_lshl_b32 s17, s88, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s54, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s66, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v24 -; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s39, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s20, s21, 0xff -; SI-NEXT: s_lshl_b32 s21, s52, 8 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s21, s96, 24 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v19 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 -; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v42 -; SI-NEXT: v_readlane_b32 s45, v62, 17 -; SI-NEXT: v_readlane_b32 s43, v62, 23 -; SI-NEXT: v_readlane_b32 s41, v62, 29 -; SI-NEXT: v_readlane_b32 s29, v62, 35 -; SI-NEXT: v_readlane_b32 s27, v62, 41 -; SI-NEXT: v_readlane_b32 s25, v62, 45 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v9, s20, v9 -; SI-NEXT: s_lshl_b32 s20, s58, 8 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: s_and_b32 s20, s98, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v24 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: s_lshl_b32 s19, s50, 8 -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s18, v0 -; SI-NEXT: s_lshl_b32 s18, s38, 8 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_and_b32 s18, s36, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s34, 24 +; SI-NEXT: v_or_b32_e32 v9, s16, v9 +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s98, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s96, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s84, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v24 -; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s37, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v22 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v8, s16, v8 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s80, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s64, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s22, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x48, v24 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s35, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v0, s16, v0 -; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_readlane_b32 s19, v36, 0 +; SI-NEXT: v_or_b32_e32 v7, s16, v7 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s48, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_and_b32 s16, s30, 0xff +; SI-NEXT: v_readlane_b32 s18, v36, 23 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v24 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x50, v24 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_mov_b32_e32 v8, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_lshl_b32 s15, s29, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v19 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_or_b32_e32 v0, s14, v0 -; SI-NEXT: s_lshl_b32 s14, s90, 8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v6, s14, v6 +; SI-NEXT: v_readlane_b32 s14, v36, 25 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s15, v36, 26 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s88, 0xff +; SI-NEXT: v_readlane_b32 s14, v36, 27 +; SI-NEXT: v_readlane_b32 s15, v36, 28 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v36, 29 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x58, v24 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s65, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: s_lshl_b32 s13, s28, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v18 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v0, s12, v0 -; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_readlane_b32 s12, v36, 31 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s13, v36, 32 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s74, 0xff +; SI-NEXT: v_readlane_b32 s12, v36, 33 +; SI-NEXT: v_readlane_b32 s13, v36, 34 +; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s72, 24 +; SI-NEXT: s_lshl_b32 s13, s70, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x60, v24 -; SI-NEXT: v_mov_b32_e32 v5, s10 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s55, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_lshl_b32 s11, s27, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v0, s10, v0 -; SI-NEXT: s_lshl_b32 s10, s60, 8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 +; SI-NEXT: v_readlane_b32 s10, v36, 37 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s11, v36, 38 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s56, 0xff +; SI-NEXT: v_readlane_b32 s10, v36, 39 +; SI-NEXT: v_readlane_b32 s11, v36, 40 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v36, 41 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x68, v24 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s53, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_lshl_b32 s9, s26, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v0, s8, v0 -; SI-NEXT: s_lshl_b32 s8, s86, 8 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 +; SI-NEXT: v_readlane_b32 s8, v36, 43 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s9, v36, 44 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s84, 0xff +; SI-NEXT: v_readlane_b32 s8, v37, 8 +; SI-NEXT: v_readlane_b32 s9, v37, 9 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v37, 6 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s82, 24 +; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x70, v24 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s51, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: s_lshl_b32 s7, s93, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v0, s6, v0 -; SI-NEXT: v_readlane_b32 s6, v62, 46 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, s6, v2 +; SI-NEXT: v_readlane_b32 s6, v37, 2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s7, v37, 3 ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 47 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s80, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 48 +; SI-NEXT: v_readlane_b32 s6, v37, 4 +; SI-NEXT: v_readlane_b32 s7, v37, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v37, 0 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x78, v24 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v24 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s19, v36, 24 +; SI-NEXT: v_readlane_b32 s17, v36, 30 +; SI-NEXT: v_readlane_b32 s13, v36, 42 +; SI-NEXT: v_readlane_b32 s11, v37, 7 +; SI-NEXT: v_readlane_b32 s9, v37, 1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v34, 35 +; SI-NEXT: v_readlane_b32 s98, v34, 34 +; SI-NEXT: v_readlane_b32 s97, v34, 33 +; SI-NEXT: v_readlane_b32 s96, v34, 32 +; SI-NEXT: v_readlane_b32 s87, v34, 31 +; SI-NEXT: v_readlane_b32 s86, v34, 30 +; SI-NEXT: v_readlane_b32 s85, v34, 29 +; SI-NEXT: v_readlane_b32 s84, v34, 28 +; SI-NEXT: v_readlane_b32 s83, v34, 27 +; SI-NEXT: v_readlane_b32 s82, v34, 26 +; SI-NEXT: v_readlane_b32 s81, v34, 25 +; SI-NEXT: v_readlane_b32 s80, v34, 24 +; SI-NEXT: v_readlane_b32 s71, v34, 23 +; SI-NEXT: v_readlane_b32 s70, v34, 22 +; SI-NEXT: v_readlane_b32 s69, v34, 21 +; SI-NEXT: v_readlane_b32 s68, v34, 20 +; SI-NEXT: v_readlane_b32 s67, v34, 19 +; SI-NEXT: v_readlane_b32 s66, v34, 18 +; SI-NEXT: v_readlane_b32 s65, v34, 17 +; SI-NEXT: v_readlane_b32 s64, v34, 16 +; SI-NEXT: v_readlane_b32 s55, v34, 15 +; SI-NEXT: v_readlane_b32 s54, v34, 14 +; SI-NEXT: v_readlane_b32 s53, v34, 13 +; SI-NEXT: v_readlane_b32 s52, v34, 12 +; SI-NEXT: v_readlane_b32 s51, v34, 11 +; SI-NEXT: v_readlane_b32 s50, v34, 10 +; SI-NEXT: v_readlane_b32 s49, v34, 9 +; SI-NEXT: v_readlane_b32 s48, v34, 8 +; SI-NEXT: v_readlane_b32 s39, v34, 7 +; SI-NEXT: v_readlane_b32 s38, v34, 6 +; SI-NEXT: v_readlane_b32 s37, v34, 5 +; SI-NEXT: v_readlane_b32 s36, v34, 4 +; SI-NEXT: v_readlane_b32 s35, v34, 3 +; SI-NEXT: v_readlane_b32 s34, v34, 2 +; SI-NEXT: v_readlane_b32 s31, v34, 1 +; SI-NEXT: v_readlane_b32 s30, v34, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -213693,8 +209495,34 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v64bf16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -213711,1233 +209539,1120 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v21 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v30 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v28 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v26 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v26 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: ; kill: killed $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v32 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v41 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_alignbit_b32 v2, v2, v45, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_alignbit_b32 v2, v2, v43, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v1, v1, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_alignbit_b32 v1, v1, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v63, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v61, v60, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v0, v61, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v58, v56, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v0, v58, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v47, v46, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v0, v47, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v44, v42, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v44, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v41, v55, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v0, v41, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v53, v52, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v53, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v51, v49, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v51, v0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v39, v38, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v39, v0, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v37, v21, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v37, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v36, v23, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v36, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v35, v25, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_alignbit_b32 v0, v35, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v34, v27, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_alignbit_b32 v0, v34, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v29, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_alignbit_b32 v0, v33, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v32, v31, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_alignbit_b32 v0, v32, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB100_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v62 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v58 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_alignbit_b32 v7, v21, v7, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v54 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v52 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v48 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v34 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_alignbit_b32 v3, v9, v3, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_alignbit_b32 v3, v13, v3, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v3, v14, v3, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_alignbit_b32 v32, v31, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v0, v32, v0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v33, v29, v3, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v34, v27, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v35, v25, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v36, v23, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v37, v21, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v39, v38, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v51, v49, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v53, v52, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v44, v42, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v47, v46, v15, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v58, v56, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v58, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v47, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v44, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v61, v60, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v41, v55, v14, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v41, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v53, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v51, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v39, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v35, v9, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v5, 16 +; SI-NEXT: v_alignbit_b32 v16, v61, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v33, v3, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v63, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v4, v17, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v6, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: @@ -217242,43 +212957,52 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s29, 16 ; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 @@ -217307,6 +213031,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -217324,975 +213062,1468 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v23 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v19 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s43 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s14 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s9 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB101_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v53 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_lshr_b64 v[4:5], v[16:17], 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v19, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_mov_b32_e32 v11, v29 +; SI-NEXT: v_mov_b32_e32 v13, v24 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_mov_b32_e32 v14, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[21:22], v[35:36], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[33:34], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[42:43], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[25:26], 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshr_b64 v[37:38], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[28:29], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_mov_b32_e32 v24, v13 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v21 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v50 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v44, v12 +; SI-NEXT: v_mov_b32_e32 v43, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshr_b64 v[47:48], v[35:36], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v51 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshr_b64 v[18:19], v[26:27], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[31:32], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v21 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[43:44], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v23, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v20, v42 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v36 +; SI-NEXT: v_mov_b32_e32 v5, v56 +; SI-NEXT: v_lshr_b64 v[28:29], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v56, v60 +; SI-NEXT: v_lshr_b64 v[35:36], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v57, v14 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v4, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshr_b64 v[8:9], v[7:8], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_lshr_b64 v[53:54], v[3:4], 16 +; SI-NEXT: v_mov_b32_e32 v3, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[58:59], 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshr_b64 v[60:61], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[14:15], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[49:50], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[51:52], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[16:17], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v40, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[62:63], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v19 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v27, v22 +; SI-NEXT: s_branch .LBB101_3 +; SI-NEXT: .LBB101_2: +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mov_b32_e32 v55, v45 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v13, v9 +; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, v61 +; SI-NEXT: .LBB101_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB101_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v62 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[33:34], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[51:52], v[52:53], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v19 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s43 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s42 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s41 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s40 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB101_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v55 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v43 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v45 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[37:38], v[0:1], 16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[22:23], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[10:11], v[35:36], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_lshr_b64 v[4:5], v[25:26], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_mov_b32_e32 v46, v13 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v11, v19 +; SI-NEXT: v_mov_b32_e32 v12, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v36 +; SI-NEXT: v_lshr_b64 v[35:36], v[42:43], 16 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v2 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_lshr_b64 v[30:31], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v44, v35 +; SI-NEXT: v_lshr_b64 v[31:32], v[32:33], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v50 +; SI-NEXT: v_lshr_b64 v[49:50], v[49:50], 16 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[62:63], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v53 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[27:28], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v26 +; SI-NEXT: v_mov_b32_e32 v26, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v37 +; SI-NEXT: v_lshr_b64 v[27:28], v[34:35], 16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[40:41], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[0:1], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_cbranch_execnz .LBB101_3 -; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshr_b64 v[2:3], v[36:37], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v52 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[7:8], v[5:6], 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[36:37], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[9:10], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[13:14], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[38:39], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[45:46], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v20, v50 +; SI-NEXT: v_mov_b32_e32 v21, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 -; SI-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 -; SI-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 -; SI-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 -; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; SI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; SI-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_or_b32_e32 v18, v51, v18 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v36 -; SI-NEXT: v_or_b32_e32 v19, v53, v19 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v31 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v38 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v39 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v49 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v20, v51, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: @@ -221088,583 +217319,414 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_mov_b32_e32 v53, v3 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v44 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 -; SI-NEXT: v_mov_b32_e32 v55, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v14 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v60 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -221689,626 +217751,641 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v53 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v58 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mov_b32_e32 v27, v9 -; SI-NEXT: v_mov_b32_e32 v26, v11 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v28, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v62 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v27 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_alignbit_b32 v23, v23, v24, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_alignbit_b32 v25, v25, v26, 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_alignbit_b32 v27, v27, v28, 16 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v29, v29, v30, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -222551,7 +218628,21 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-LABEL: bitcast_v64f16_to_v64bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -222568,1194 +218659,726 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v62 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v17, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB103_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v43 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: s_branch .LBB103_3 -; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v53, v15 -; SI-NEXT: v_mov_b32_e32 v56, v13 -; SI-NEXT: v_mov_b32_e32 v8, v39 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v13, v0 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v14 -; SI-NEXT: v_mov_b32_e32 v47, v11 -; SI-NEXT: v_mov_b32_e32 v23, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v16 -; SI-NEXT: v_mov_b32_e32 v16, v9 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v10, v2 -; SI-NEXT: v_mov_b32_e32 v6, v31 -; SI-NEXT: v_mov_b32_e32 v22, v29 -; SI-NEXT: v_mov_b32_e32 v26, v33 -; SI-NEXT: v_mov_b32_e32 v2, v49 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB103_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_mov_b32_e32 v19, v34 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: s_lshl_b32 s45, s6, 16 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 16 +; SI-NEXT: s_lshl_b32 s56, s18, 16 +; SI-NEXT: s_lshl_b32 s57, s8, 16 +; SI-NEXT: s_lshl_b32 s58, s19, 16 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_lshl_b32 s60, s20, 16 +; SI-NEXT: s_lshl_b32 s61, s10, 16 +; SI-NEXT: s_lshl_b32 s62, s21, 16 +; SI-NEXT: s_lshl_b32 s63, s11, 16 +; SI-NEXT: s_lshl_b32 s72, s22, 16 +; SI-NEXT: s_lshl_b32 s73, s12, 16 +; SI-NEXT: s_lshl_b32 s74, s23, 16 +; SI-NEXT: s_lshl_b32 s75, s13, 16 +; SI-NEXT: s_lshl_b32 s76, s24, 16 +; SI-NEXT: s_lshl_b32 s77, s14, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 16 +; SI-NEXT: s_lshl_b32 s79, s15, 16 +; SI-NEXT: s_lshl_b32 s88, s26, 16 +; SI-NEXT: s_lshl_b32 s89, s40, 16 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_lshl_b32 s91, s41, 16 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_lshl_b32 s93, s42, 16 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_lshl_b32 s95, s43, 16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mov_b32_e32 v27, v32 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v32 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v35 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v36 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v43 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v44, v45 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v14 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_mov_b32_e32 v54, v30 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB103_4 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v52 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v42 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v53 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v55 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v56 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v47 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v61 -; SI-NEXT: v_mov_b32_e32 v18, v53 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v61 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: .LBB103_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB103_5 +; SI-NEXT: .LBB103_3: +; SI-NEXT: v_mov_b32_e32 v54, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_mov_b32_e32 v27, v32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v37, v46 +; SI-NEXT: v_mov_b32_e32 v44, v45 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v22, v36 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_mov_b32_e32 v19, v34 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_branch .LBB103_2 +; SI-NEXT: .LBB103_4: +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, s95 +; SI-NEXT: v_mov_b32_e32 v26, s94 +; SI-NEXT: v_mov_b32_e32 v25, s93 +; SI-NEXT: v_mov_b32_e32 v24, s92 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, s91 +; SI-NEXT: v_mov_b32_e32 v22, s90 +; SI-NEXT: v_mov_b32_e32 v21, s89 +; SI-NEXT: v_mov_b32_e32 v20, s88 +; SI-NEXT: v_mov_b32_e32 v19, s79 +; SI-NEXT: v_mov_b32_e32 v18, s78 +; SI-NEXT: v_mov_b32_e32 v17, s77 +; SI-NEXT: v_mov_b32_e32 v63, v47 +; SI-NEXT: v_mov_b32_e32 v47, s76 +; SI-NEXT: v_mov_b32_e32 v15, s75 +; SI-NEXT: v_mov_b32_e32 v14, s74 +; SI-NEXT: v_mov_b32_e32 v13, s73 +; SI-NEXT: v_mov_b32_e32 v12, s72 +; SI-NEXT: v_mov_b32_e32 v11, s63 +; SI-NEXT: v_mov_b32_e32 v10, s62 +; SI-NEXT: v_mov_b32_e32 v9, s61 +; SI-NEXT: v_mov_b32_e32 v57, s60 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v45, s59 +; SI-NEXT: v_mov_b32_e32 v6, s58 +; SI-NEXT: v_mov_b32_e32 v5, s57 +; SI-NEXT: v_mov_b32_e32 v4, s56 +; SI-NEXT: v_mov_b32_e32 v3, s47 +; SI-NEXT: v_mov_b32_e32 v2, s46 +; SI-NEXT: v_mov_b32_e32 v1, s45 +; SI-NEXT: v_mov_b32_e32 v0, s44 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v36 +; SI-NEXT: v_mov_b32_e32 v7, v38 +; SI-NEXT: .LBB103_5: ; %end +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: v_lshr_b64 v[45:46], v[46:47], 16 -; SI-NEXT: v_lshr_b64 v[46:47], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[7:8], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v4 ; SI-NEXT: v_lshr_b64 v[4:5], v[5:6], 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[33:34], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[37:38], 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshr_b64 v[36:37], v[36:37], 16 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[7:8], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[10:11], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[24:25], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[28:29], 16 ; SI-NEXT: v_lshr_b64 v[15:16], v[30:31], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[56:57], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[58:59], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[60:61], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[62:63], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[41:42], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v0, v45 -; SI-NEXT: v_mov_b32_e32 v1, v46 +; SI-NEXT: v_lshr_b64 v[17:18], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[49:50], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[46:47], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[62:63], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[57:58], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[60:61], 16 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -223772,7 +219395,26 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[30:31], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v3, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[31:32], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: @@ -233338,13 +228980,8 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v64f16_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v50, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -233361,562 +228998,409 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v62 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v41 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v63 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v61, v55 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v37 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v31, v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; SI-NEXT: v_or_b32_e32 v29, v29, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v50 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_or_b32_e32 v21, v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v19, v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_or_b32_e32 v17, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v50 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_or_b32_e32 v9, v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v35 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v35 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 -; SI-NEXT: v_or_b32_e32 v7, v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_or_b32_e32 v5, v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_or_b32_e32 v11, v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_or_b32_e32 v3, v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v54 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v50, v37, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v61 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v35 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v41, v41, v61 -; SI-NEXT: v_or_b32_e32 v55, v55, v60 -; SI-NEXT: v_or_b32_e32 v53, v53, v59 -; SI-NEXT: v_or_b32_e32 v52, v52, v58 -; SI-NEXT: v_or_b32_e32 v51, v51, v57 -; SI-NEXT: v_or_b32_e32 v49, v49, v56 -; SI-NEXT: v_or_b32_e32 v48, v48, v47 -; SI-NEXT: v_or_b32_e32 v39, v39, v46 -; SI-NEXT: v_or_b32_e32 v38, v38, v45 -; SI-NEXT: v_or_b32_e32 v36, v36, v43 -; SI-NEXT: v_or_b32_e32 v34, v34, v42 -; SI-NEXT: v_or_b32_e32 v35, v35, v54 -; SI-NEXT: v_or_b32_e32 v33, v33, v40 -; SI-NEXT: v_alignbit_b32 v63, v1, v0, 16 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v10, v10, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v42 +; SI-NEXT: v_or_b32_e32 v22, v22, v55 +; SI-NEXT: v_or_b32_e32 v24, v24, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v49 +; SI-NEXT: v_or_b32_e32 v28, v28, v38 +; SI-NEXT: v_alignbit_b32 v63, v1, v35, 16 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_alignbit_b32 v62, v3, v62, 16 +; SI-NEXT: v_alignbit_b32 v61, v5, v61, 16 ; SI-NEXT: v_alignbit_b32 v60, v7, v60, 16 ; SI-NEXT: v_alignbit_b32 v59, v9, v59, 16 ; SI-NEXT: v_alignbit_b32 v58, v11, v58, 16 ; SI-NEXT: v_alignbit_b32 v57, v13, v57, 16 ; SI-NEXT: v_alignbit_b32 v56, v15, v56, 16 ; SI-NEXT: v_alignbit_b32 v47, v17, v47, 16 -; SI-NEXT: v_alignbit_b32 v46, v19, v46, 16 -; SI-NEXT: v_alignbit_b32 v45, v21, v45, 16 -; SI-NEXT: v_alignbit_b32 v43, v25, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v54, v29, v54, 16 -; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_or_b32_e32 v62, v62, v37 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v19, v45, 16 +; SI-NEXT: v_alignbit_b32 v42, v21, v42, 16 +; SI-NEXT: v_alignbit_b32 v55, v23, v55, 16 +; SI-NEXT: v_alignbit_b32 v52, v25, v52, 16 +; SI-NEXT: v_alignbit_b32 v49, v27, v49, 16 +; SI-NEXT: v_alignbit_b32 v38, v29, v38, 16 +; SI-NEXT: v_alignbit_b32 v36, v31, v32, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: v_or_b32_e32 v62, v62, v44 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v62, v3, v61, 16 -; SI-NEXT: v_alignbit_b32 v61, v5, v37, 16 -; SI-NEXT: v_alignbit_b32 v44, v23, v44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v62 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v60 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_or_b32_e32 v9, v9, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v54 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v56 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v47 +; SI-NEXT: v_or_b32_e32 v16, v16, v32 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v43 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v40 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v26, v26, v34 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v24, v24, v36 -; SI-NEXT: v_or_b32_e32 v28, v28, v34 -; SI-NEXT: v_or_b32_e32 v30, v30, v33 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v60 -; SI-NEXT: v_or_b32_e32 v6, v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v59 -; SI-NEXT: v_or_b32_e32 v8, v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v57 -; SI-NEXT: v_or_b32_e32 v12, v12, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v47 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v45 -; SI-NEXT: v_or_b32_e32 v20, v20, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -233933,10 +229417,43 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v21, v32 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v35 +; SI-NEXT: v_or_b32_e32 v29, v29, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v37 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16: @@ -234177,83 +229694,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -234270,652 +229714,613 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v17 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_mov_b32_e32 v42, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s20 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v39 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b32 s42, s16, 16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB109_2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB109_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB109_3 -; SI-NEXT: .LBB109_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: .LBB109_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v61 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: s_cbranch_vccnz .LBB109_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v52 -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_mov_b32_e32 v28, v53 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: s_cbranch_execnz .LBB109_4 +; SI-NEXT: .LBB109_2: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v40, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_or_b32_e32 v46, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_or_b32_e32 v26, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v54, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_or_b32_e32 v44, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v38, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_mov_b32_e32 v58, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v29, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v42 -; SI-NEXT: v_or_b32_e32 v31, v31, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v36, v39, v2 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v45, v48, v4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v40, v39, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v58 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v31, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v48, v11, v12 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_mov_b32_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v24, v1, v2 +; SI-NEXT: v_mov_b32_e32 v62, v24 +; SI-NEXT: v_mov_b32_e32 v61, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v36, v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v55, v39 +; SI-NEXT: v_or_b32_e32 v10, v19, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_or_b32_e32 v42, v15, v22 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v15, v26 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_or_b32_e32 v60, v17, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v13, v18, v45 +; SI-NEXT: v_or_b32_e32 v22, v28, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 +; SI-NEXT: v_or_b32_e32 v12, v27, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v39, v26, v37 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v23 +; SI-NEXT: v_or_b32_e32 v47, v27, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 +; SI-NEXT: v_mov_b32_e32 v18, v30 +; SI-NEXT: v_mov_b32_e32 v19, v31 +; SI-NEXT: v_or_b32_e32 v23, v25, v32 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[61:62], 16 +; SI-NEXT: v_lshr_b64 v[61:62], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v26, v21 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshr_b64 v[62:63], v[0:1], 16 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshr_b64 v[56:57], v[6:7], 16 -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_lshr_b64 v[46:47], v[8:9], 16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_lshr_b64 v[60:61], v[2:3], 16 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v39, v48, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: v_or_b32_e32 v2, v24, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v38, v38, v6 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v23, v5 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v39, v39, v14 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v2, v27, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v48, v16 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v49 -; SI-NEXT: v_or_b32_e32 v38, v38, v12 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_or_b32_e32 v34, v39, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v55, v48, v22 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v52, v38, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_lshr_b64 v[58:59], v[4:5], 16 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v36 -; SI-NEXT: v_or_b32_e32 v36, v38, v24 -; SI-NEXT: v_or_b32_e32 v38, v39, v26 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v48, v28 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v44, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v4, v28, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v49, v52 +; SI-NEXT: v_mov_b32_e32 v51, v55 +; SI-NEXT: v_mov_b32_e32 v52, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v44 +; SI-NEXT: v_lshr_b64 v[55:56], v[32:33], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[56:57], v[5:6], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v49, v30 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[35:36], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 +; SI-NEXT: v_or_b32_e32 v23, v27, v35 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[35:36], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v18, v33 -; SI-NEXT: v_mov_b32_e32 v33, v50 -; SI-NEXT: v_lshr_b64 v[50:51], v[20:21], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_lshr_b64 v[51:52], v[26:27], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[51:52], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[10:11], 16 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[48:49], v[22:23], 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[38:39], v[24:25], 16 +; SI-NEXT: v_or_b32_e32 v23, v30, v41 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[27:28], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[45:46], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v40, v59 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, v15 +; SI-NEXT: v_mov_b32_e32 v23, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[23:24], 16 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v7, v12 +; SI-NEXT: v_mov_b32_e32 v15, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_lshr_b64 v[16:17], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[37:38], 16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v58 +; SI-NEXT: v_lshr_b64 v[57:58], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[41:42], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[59:60], 16 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v50, v46 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v46, v32 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: s_branch .LBB109_5 +; SI-NEXT: .LBB109_3: +; SI-NEXT: s_branch .LBB109_2 +; SI-NEXT: .LBB109_4: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_mov_b32_e32 v1, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, s8 +; SI-NEXT: v_mov_b32_e32 v43, s10 +; SI-NEXT: v_mov_b32_e32 v53, s12 +; SI-NEXT: v_mov_b32_e32 v31, s14 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 v52, s19 +; SI-NEXT: v_mov_b32_e32 v50, s21 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v54, s25 +; SI-NEXT: v_mov_b32_e32 v38, s29 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v39, s28 +; SI-NEXT: v_mov_b32_e32 v47, v28 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v58, v41 +; SI-NEXT: v_mov_b32_e32 v62, v22 +; SI-NEXT: v_mov_b32_e32 v41, v7 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v44, v12 +; SI-NEXT: v_mov_b32_e32 v26, v11 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[51:52], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v55, v17 +; SI-NEXT: v_mov_b32_e32 v25, v16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v1, s42 +; SI-NEXT: v_mov_b32_e32 v15, v27 +; SI-NEXT: v_mov_b32_e32 v27, s40 +; SI-NEXT: v_mov_b32_e32 v30, s13 +; SI-NEXT: v_mov_b32_e32 v14, s11 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v40, v33 +; SI-NEXT: v_mov_b32_e32 v21, v32 ; SI-NEXT: .LBB109_5: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v40 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v51, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v5, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; SI-NEXT: v_or_b32_e32 v34, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_mov_b32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v39, v1, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v63 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v28, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_mov_b32_e32 v15, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -234932,8 +230337,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v31, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v3, v51 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: @@ -235249,6 +230658,76 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v64i16_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -235265,949 +230744,867 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v52 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: v_or_b32_e32 v61, v1, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v62, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v59, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v61, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v56, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v1, v47, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v46, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v62, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v59, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v57, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v42, v1, v48 +; SI-NEXT: v_alignbit_b32 v1, v56, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v41, v1, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v46, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v55, v1, v39 +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v53, v1, v4 +; SI-NEXT: v_alignbit_b32 v1, v43, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v4, v1, v6 +; SI-NEXT: v_alignbit_b32 v1, v42, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v3, v1, v8 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v50, v1, v31 +; SI-NEXT: v_alignbit_b32 v1, v55, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v53, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v4, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v50, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v61, v3, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v5, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v62, v7, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v59, v9, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v11, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v56, v10, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v46, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v53, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v6, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v2, v29, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v50, v28, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64f16: @@ -236447,746 +231844,812 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-LABEL: bitcast_v64i16_to_v64f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_writelane_b32 v32, s48, 8 +; SI-NEXT: v_writelane_b32 v32, s49, 9 +; SI-NEXT: v_writelane_b32 v32, s50, 10 +; SI-NEXT: v_writelane_b32 v32, s51, 11 +; SI-NEXT: v_writelane_b32 v32, s52, 12 +; SI-NEXT: v_writelane_b32 v32, s53, 13 +; SI-NEXT: v_writelane_b32 v32, s54, 14 +; SI-NEXT: v_writelane_b32 v32, s55, 15 +; SI-NEXT: v_writelane_b32 v32, s64, 16 +; SI-NEXT: v_writelane_b32 v32, s65, 17 +; SI-NEXT: v_writelane_b32 v32, s66, 18 +; SI-NEXT: v_writelane_b32 v32, s67, 19 +; SI-NEXT: v_writelane_b32 v32, s68, 20 +; SI-NEXT: v_writelane_b32 v32, s69, 21 +; SI-NEXT: v_writelane_b32 v32, s70, 22 +; SI-NEXT: v_writelane_b32 v32, s71, 23 +; SI-NEXT: v_writelane_b32 v32, s80, 24 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v32, s81, 25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v33, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_writelane_b32 v32, s82, 26 +; SI-NEXT: v_writelane_b32 v33, s4, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_writelane_b32 v32, s83, 27 +; SI-NEXT: v_writelane_b32 v33, s4, 2 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v32, s84, 28 +; SI-NEXT: v_writelane_b32 v33, s4, 3 +; SI-NEXT: v_writelane_b32 v32, s85, 29 +; SI-NEXT: v_writelane_b32 v33, s29, 4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v32, s86, 30 +; SI-NEXT: v_writelane_b32 v33, s4, 5 +; SI-NEXT: v_writelane_b32 v32, s87, 31 +; SI-NEXT: v_writelane_b32 v33, s27, 6 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_writelane_b32 v32, s96, 32 +; SI-NEXT: v_writelane_b32 v33, s4, 7 +; SI-NEXT: v_writelane_b32 v32, s97, 33 +; SI-NEXT: v_writelane_b32 v33, s25, 8 +; SI-NEXT: v_writelane_b32 v32, s98, 34 +; SI-NEXT: v_writelane_b32 v33, s23, 9 +; SI-NEXT: v_writelane_b32 v32, s99, 35 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: s_lshr_b32 s48, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s67, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s24, 16 +; SI-NEXT: s_lshr_b32 s65, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s55, s21, 16 +; SI-NEXT: s_lshr_b32 s95, s20, 16 +; SI-NEXT: v_writelane_b32 v33, s19, 10 +; SI-NEXT: s_lshr_b32 s53, s19, 16 +; SI-NEXT: s_lshr_b32 s93, s18, 16 +; SI-NEXT: s_lshr_b32 s51, s17, 16 +; SI-NEXT: s_lshr_b32 s99, s16, 16 +; SI-NEXT: v_writelane_b32 v33, s17, 11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_writelane_b32 v33, s16, 12 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_mov_b32 s56, s18 +; SI-NEXT: v_writelane_b32 v33, s4, 13 +; SI-NEXT: s_mov_b32 s58, s22 +; SI-NEXT: v_writelane_b32 v33, s56, 14 +; SI-NEXT: s_mov_b32 s59, s24 +; SI-NEXT: v_writelane_b32 v33, s58, 15 +; SI-NEXT: s_mov_b32 s97, s26 +; SI-NEXT: v_writelane_b32 v33, s59, 16 +; SI-NEXT: s_mov_b32 s85, s28 +; SI-NEXT: v_writelane_b32 v33, s97, 17 +; SI-NEXT: s_mov_b32 s57, s20 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_readfirstlane_b32 s36, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_writelane_b32 v33, s85, 18 +; SI-NEXT: v_readfirstlane_b32 s81, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s89, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_readfirstlane_b32 s37, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s70, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s39, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s38, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s30, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s91, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s46, v15 +; SI-NEXT: v_writelane_b32 v33, s57, 19 +; SI-NEXT: s_mov_b32 s88, s21 +; SI-NEXT: v_readfirstlane_b32 s94, v0 +; SI-NEXT: v_readfirstlane_b32 s96, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v17 +; SI-NEXT: v_readfirstlane_b32 s87, v16 +; SI-NEXT: v_readfirstlane_b32 s98, v14 +; SI-NEXT: v_readfirstlane_b32 s92, v13 +; SI-NEXT: v_readfirstlane_b32 s83, v12 +; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_readfirstlane_b32 s82, v10 +; SI-NEXT: v_readfirstlane_b32 s68, v9 +; SI-NEXT: v_readfirstlane_b32 s80, v8 +; SI-NEXT: v_readfirstlane_b32 s29, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_readfirstlane_b32 s49, v4 +; SI-NEXT: v_readfirstlane_b32 s77, v18 +; SI-NEXT: v_readfirstlane_b32 s84, v3 +; SI-NEXT: v_readfirstlane_b32 s27, v2 +; SI-NEXT: v_writelane_b32 v33, s46, 20 +; SI-NEXT: v_writelane_b32 v33, s47, 21 ; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_mov_b32_e32 v48, v23 -; SI-NEXT: v_mov_b32_e32 v49, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v55, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v40, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v32 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v41, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_mov_b32_e32 v42, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 -; SI-NEXT: v_mov_b32_e32 v43, v59 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_mov_b32_e32 v44, v63 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_mov_b32_e32 v46, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_readlane_b32 s5, v33, 11 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: v_readlane_b32 s4, v33, 7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v33, 13 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: v_readlane_b32 s4, v33, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s44, s84, 16 +; SI-NEXT: v_readlane_b32 s16, v33, 2 +; SI-NEXT: v_writelane_b32 v33, s39, 34 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s30, 0xffff +; SI-NEXT: s_lshl_b32 s45, s49, 16 +; SI-NEXT: v_writelane_b32 v33, s38, 35 +; SI-NEXT: s_or_b32 vcc_hi, s44, s45 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s45, s71, 16 +; SI-NEXT: v_readlane_b32 s16, v33, 1 +; SI-NEXT: s_or_b32 s39, s44, s45 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s45, s80, 16 +; SI-NEXT: v_writelane_b32 v33, s49, 36 +; SI-NEXT: v_writelane_b32 v33, s48, 37 +; SI-NEXT: s_or_b32 s49, s44, s45 +; SI-NEXT: s_and_b32 s44, s90, 0xffff +; SI-NEXT: s_lshl_b32 s45, s82, 16 +; SI-NEXT: v_writelane_b32 v33, s29, 38 +; SI-NEXT: s_mov_b32 s69, s51 +; SI-NEXT: s_or_b32 s51, s44, s45 +; SI-NEXT: s_and_b32 s44, s70, 0xffff +; SI-NEXT: s_lshl_b32 s45, s83, 16 +; SI-NEXT: v_writelane_b32 v33, s27, 39 +; SI-NEXT: s_mov_b32 s34, s83 +; SI-NEXT: s_mov_b32 s83, s70 +; SI-NEXT: s_mov_b32 s70, s53 +; SI-NEXT: s_or_b32 s53, s44, s45 +; SI-NEXT: s_and_b32 s44, s89, 0xffff +; SI-NEXT: s_lshl_b32 s45, s98, 16 +; SI-NEXT: s_mov_b32 s86, s71 +; SI-NEXT: s_mov_b32 s71, s55 +; SI-NEXT: s_or_b32 s55, s44, s45 +; SI-NEXT: s_and_b32 s44, s79, 0xffff +; SI-NEXT: s_lshl_b32 s45, s87, 16 +; SI-NEXT: v_writelane_b32 v33, s92, 40 +; SI-NEXT: s_lshl_b32 s38, s17, 16 +; SI-NEXT: s_lshl_b32 s54, s92, 16 +; SI-NEXT: v_writelane_b32 v33, s25, 41 +; SI-NEXT: s_mov_b32 s92, s17 +; SI-NEXT: s_mov_b32 s17, s79 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s89, s87 +; SI-NEXT: s_mov_b32 s87, s30 +; SI-NEXT: s_mov_b32 s30, s80 +; SI-NEXT: s_mov_b32 s80, s65 +; SI-NEXT: s_or_b32 s65, s44, s45 +; SI-NEXT: s_and_b32 s44, s36, 0xffff +; SI-NEXT: s_lshl_b32 s45, s96, 16 +; SI-NEXT: s_mov_b32 s16, s36 +; SI-NEXT: s_mov_b32 s36, s96 +; SI-NEXT: s_mov_b32 s96, s82 +; SI-NEXT: s_mov_b32 s82, s76 +; SI-NEXT: s_mov_b32 s76, s67 +; SI-NEXT: s_or_b32 s67, s44, s45 +; SI-NEXT: v_readlane_b32 s44, v33, 12 +; SI-NEXT: s_lshl_b32 s42, s99, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s62, s44, s42 +; SI-NEXT: s_lshr_b64 s[18:19], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s56, 0xffff +; SI-NEXT: s_lshl_b32 s14, s95, 16 +; SI-NEXT: s_or_b32 s74, s42, s40 +; SI-NEXT: s_lshr_b64 s[20:21], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s57, 0xffff +; SI-NEXT: s_lshl_b32 s12, s31, 16 +; SI-NEXT: s_or_b32 s72, s40, s14 +; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s58, 0xffff +; SI-NEXT: s_lshl_b32 s10, s35, 16 +; SI-NEXT: s_lshl_b32 s52, s25, 16 +; SI-NEXT: s_or_b32 s60, s14, s12 +; SI-NEXT: s_lshr_b64 s[24:25], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s59, 0xffff +; SI-NEXT: s_lshl_b32 s6, s4, 16 +; SI-NEXT: s_lshl_b32 s4, s27, 16 +; SI-NEXT: s_or_b32 s58, s12, s10 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s97, 0xffff +; SI-NEXT: s_lshl_b32 s48, s29, 16 +; SI-NEXT: s_or_b32 s56, s10, s8 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s85, 0xffff +; SI-NEXT: s_lshl_b32 s64, s46, 16 +; SI-NEXT: s_lshl_b32 s66, s47, 16 +; SI-NEXT: s_or_b32 s46, s8, s6 +; SI-NEXT: s_mov_b32 s47, s7 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v33, s6, 22 +; SI-NEXT: v_writelane_b32 v33, s7, 23 +; SI-NEXT: s_and_b32 s6, s94, 0xffff +; SI-NEXT: s_or_b32 s44, s6, s4 +; SI-NEXT: s_mov_b32 s45, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v33, s4, 24 +; SI-NEXT: s_lshl_b32 vcc_lo, s77, 16 +; SI-NEXT: v_writelane_b32 v33, s5, 25 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_or_b32 s42, s4, vcc_lo +; SI-NEXT: s_lshr_b64 s[4:5], vcc, 16 +; SI-NEXT: v_writelane_b32 v33, s4, 26 +; SI-NEXT: v_writelane_b32 v33, s5, 27 +; SI-NEXT: v_readlane_b32 s4, v33, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s40, s4, s38 +; SI-NEXT: s_lshr_b64 s[4:5], s[38:39], 16 +; SI-NEXT: v_writelane_b32 v33, s4, 28 +; SI-NEXT: v_writelane_b32 v33, s5, 29 +; SI-NEXT: v_readlane_b32 s38, v33, 35 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_or_b32 s14, s4, s48 +; SI-NEXT: s_lshr_b64 s[4:5], s[48:49], 16 +; SI-NEXT: s_mov_b32 s75, s41 +; SI-NEXT: s_mov_b32 s41, s39 +; SI-NEXT: v_readlane_b32 s39, v33, 34 +; SI-NEXT: v_writelane_b32 v33, s4, 30 +; SI-NEXT: v_writelane_b32 v33, s5, 31 +; SI-NEXT: v_readlane_b32 s4, v33, 0 +; SI-NEXT: s_lshl_b32 s50, s68, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s12, s4, s50 +; SI-NEXT: s_lshr_b64 s[4:5], s[50:51], 16 +; SI-NEXT: s_mov_b32 s73, s15 +; SI-NEXT: s_mov_b32 s15, s49 +; SI-NEXT: v_readlane_b32 s48, v33, 37 +; SI-NEXT: v_readlane_b32 s49, v33, 36 +; SI-NEXT: v_writelane_b32 v33, s4, 32 +; SI-NEXT: v_writelane_b32 v33, s5, 33 +; SI-NEXT: s_and_b32 s4, s39, 0xffff +; SI-NEXT: s_or_b32 s10, s4, s52 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_or_b32 s8, s4, s54 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s64 +; SI-NEXT: s_and_b32 s4, s78, 0xffff +; SI-NEXT: s_mov_b32 s59, s11 +; SI-NEXT: s_mov_b32 s57, s9 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_mov_b32 s7, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 +; SI-NEXT: s_or_b32 s4, s4, s66 +; SI-NEXT: s_mov_b32 s5, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 +; SI-NEXT: s_mov_b32 s63, s43 +; SI-NEXT: s_mov_b32 s61, s13 +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_mov_b32 s51, s69 +; SI-NEXT: v_readlane_b32 s29, v33, 38 +; SI-NEXT: s_mov_b32 s53, s70 +; SI-NEXT: s_mov_b32 s70, s83 +; SI-NEXT: s_mov_b32 s83, s34 +; SI-NEXT: v_readlane_b32 s27, v33, 39 +; SI-NEXT: s_mov_b32 s55, s71 +; SI-NEXT: s_mov_b32 s71, s86 +; SI-NEXT: s_mov_b32 s65, s80 +; SI-NEXT: s_mov_b32 s80, s30 +; SI-NEXT: s_mov_b32 s30, s87 +; SI-NEXT: s_mov_b32 s87, s89 +; SI-NEXT: s_mov_b32 s89, s79 +; SI-NEXT: s_mov_b32 s79, s17 +; SI-NEXT: s_mov_b32 s17, s92 +; SI-NEXT: v_readlane_b32 s25, v33, 41 +; SI-NEXT: v_readlane_b32 s92, v33, 40 +; SI-NEXT: s_mov_b32 s67, s76 +; SI-NEXT: s_mov_b32 s76, s82 +; SI-NEXT: s_mov_b32 s82, s96 +; SI-NEXT: s_mov_b32 s96, s36 +; SI-NEXT: s_mov_b32 s36, s16 ; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v37 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v48 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s9 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s4, s78, 3 +; SI-NEXT: v_readlane_b32 s5, v33, 21 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s36, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s96, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s81, 3 +; SI-NEXT: v_readlane_b32 s7, v33, 20 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s7, s79, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s87, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s37, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s9, s92, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s89, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s98, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s10, s39, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s70, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s83, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v33, 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, s11 -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v39 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s43 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s13, s68, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s90, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s82, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s14, s38, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s15, s29, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v33, 1 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s80, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v33, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s71, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s91, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s77, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s30, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s94, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s27, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 13 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 18 +; SI-NEXT: s_add_i32 s28, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v33, 5 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 4 +; SI-NEXT: s_add_i32 s29, s16, 3 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 17 +; SI-NEXT: s_add_i32 s26, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v33, 7 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s56, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 6 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 16 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 8 +; SI-NEXT: s_add_i32 s25, s16, 3 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 15 +; SI-NEXT: s_add_i32 s22, s16, 3 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s31, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 9 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 19 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: s_and_b32 s16, s20, 0xffff +; SI-NEXT: s_lshl_b32 s17, s95, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s21, s88, 3 +; SI-NEXT: s_add_i32 s72, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 14 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_lshl_b32 s17, s93, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 10 +; SI-NEXT: s_add_i32 s19, s16, 3 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s75, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 12 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s99, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v33, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 22 +; SI-NEXT: v_writelane_b32 v33, s17, 23 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 24 +; SI-NEXT: v_writelane_b32 v33, s17, 25 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 26 +; SI-NEXT: v_writelane_b32 v33, s17, 27 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v33, s16, 28 +; SI-NEXT: v_writelane_b32 v33, s17, 29 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: v_writelane_b32 v33, s16, 30 +; SI-NEXT: v_writelane_b32 v33, s17, 31 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[56:57], 16 +; SI-NEXT: v_writelane_b32 v33, s16, 32 +; SI-NEXT: s_lshr_b32 s51, s63, 16 +; SI-NEXT: s_lshr_b32 s53, s75, 16 +; SI-NEXT: s_lshr_b32 s55, s73, 16 +; SI-NEXT: s_lshr_b32 s65, s61, 16 +; SI-NEXT: s_lshr_b32 s67, s59, 16 +; SI-NEXT: s_lshr_b32 s76, s57, 16 +; SI-NEXT: s_lshr_b32 s48, s47, 16 +; SI-NEXT: s_lshr_b32 s84, s45, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s80, s15, 16 +; SI-NEXT: s_lshr_b32 s82, s13, 16 +; SI-NEXT: s_lshr_b32 s83, s11, 16 +; SI-NEXT: s_lshr_b32 s98, s9, 16 +; SI-NEXT: s_lshr_b32 s87, s7, 16 +; SI-NEXT: s_lshr_b32 s96, s5, 16 +; SI-NEXT: v_writelane_b32 v33, s17, 33 ; SI-NEXT: .LBB111_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v57 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v35, v19 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v20, v34, v20 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v45 -; SI-NEXT: v_or_b32_e32 v23, v35, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v58 -; SI-NEXT: v_or_b32_e32 v25, v35, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v62 -; SI-NEXT: v_or_b32_e32 v27, v35, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v63 -; SI-NEXT: v_or_b32_e32 v29, v35, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_or_b32_e32 v24, v34, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v46 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_or_b32_e32 v31, v34, v31 +; SI-NEXT: s_and_b32 s16, s62, 0xffff +; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s63, 0xffff +; SI-NEXT: s_lshl_b32 s18, s51, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s74, 0xffff +; SI-NEXT: s_lshl_b32 s19, s20, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s75, 0xffff +; SI-NEXT: s_lshl_b32 s20, s53, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s72, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s73, 0xffff +; SI-NEXT: s_lshl_b32 s22, s55, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s60, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s61, 0xffff +; SI-NEXT: s_lshl_b32 s24, s65, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s58, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s59, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s56, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s57, 0xffff +; SI-NEXT: s_lshl_b32 s28, s76, 16 +; SI-NEXT: v_readlane_b32 s56, v33, 22 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s56, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 25 +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s84, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 26 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 27 +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 28 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 29 +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 30 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: v_readlane_b32 s47, v33, 31 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s80, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: v_readlane_b32 s46, v33, 32 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s82, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s83, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s98, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s6, s6, s46 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s46, s87, 16 +; SI-NEXT: s_or_b32 s7, s7, s46 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s46, s66, 16 +; SI-NEXT: s_or_b32 s4, s4, s46 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s46, s96, 16 +; SI-NEXT: s_or_b32 s5, s5, s46 +; SI-NEXT: v_readlane_b32 s57, v33, 23 +; SI-NEXT: v_readlane_b32 s47, v33, 33 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s44 +; SI-NEXT: v_mov_b32_e32 v15, s45 +; SI-NEXT: v_mov_b32_e32 v16, s42 +; SI-NEXT: v_mov_b32_e32 v17, s43 +; SI-NEXT: v_mov_b32_e32 v18, s40 +; SI-NEXT: v_mov_b32_e32 v19, s41 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_mov_b32_e32 v22, s12 +; SI-NEXT: v_mov_b32_e32 v23, s13 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s11 +; SI-NEXT: v_mov_b32_e32 v26, s8 +; SI-NEXT: v_mov_b32_e32 v27, s9 +; SI-NEXT: v_mov_b32_e32 v28, s6 +; SI-NEXT: v_mov_b32_e32 v29, s7 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v31, s5 +; SI-NEXT: v_readlane_b32 s99, v32, 35 +; SI-NEXT: v_readlane_b32 s98, v32, 34 +; SI-NEXT: v_readlane_b32 s97, v32, 33 +; SI-NEXT: v_readlane_b32 s96, v32, 32 +; SI-NEXT: v_readlane_b32 s87, v32, 31 +; SI-NEXT: v_readlane_b32 s86, v32, 30 +; SI-NEXT: v_readlane_b32 s85, v32, 29 +; SI-NEXT: v_readlane_b32 s84, v32, 28 +; SI-NEXT: v_readlane_b32 s83, v32, 27 +; SI-NEXT: v_readlane_b32 s82, v32, 26 +; SI-NEXT: v_readlane_b32 s81, v32, 25 +; SI-NEXT: v_readlane_b32 s80, v32, 24 +; SI-NEXT: v_readlane_b32 s71, v32, 23 +; SI-NEXT: v_readlane_b32 s70, v32, 22 +; SI-NEXT: v_readlane_b32 s69, v32, 21 +; SI-NEXT: v_readlane_b32 s68, v32, 20 +; SI-NEXT: v_readlane_b32 s67, v32, 19 +; SI-NEXT: v_readlane_b32 s66, v32, 18 +; SI-NEXT: v_readlane_b32 s65, v32, 17 +; SI-NEXT: v_readlane_b32 s64, v32, 16 +; SI-NEXT: v_readlane_b32 s55, v32, 15 +; SI-NEXT: v_readlane_b32 s54, v32, 14 +; SI-NEXT: v_readlane_b32 s53, v32, 13 +; SI-NEXT: v_readlane_b32 s52, v32, 12 +; SI-NEXT: v_readlane_b32 s51, v32, 11 +; SI-NEXT: v_readlane_b32 s50, v32, 10 +; SI-NEXT: v_readlane_b32 s49, v32, 9 +; SI-NEXT: v_readlane_b32 s48, v32, 8 +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB111_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v44, v63 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v43, v59 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v42, v38 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v41, v36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v40, v32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v55, v31 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v52, v27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v51, v26 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v49, v24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v48, v23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 22 +; SI-NEXT: v_writelane_b32 v33, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: v_writelane_b32 v33, s4, 24 +; SI-NEXT: v_writelane_b32 v33, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 26 +; SI-NEXT: v_writelane_b32 v33, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 28 +; SI-NEXT: v_writelane_b32 v33, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 30 +; SI-NEXT: v_writelane_b32 v33, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v33, s4, 32 +; SI-NEXT: v_writelane_b32 v33, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB111_2 ; ; VI-LABEL: bitcast_v64i16_to_v64f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 7351cff50f25f..67fb9a9e56a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1634,72 +1634,42 @@ define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8f16: @@ -1774,63 +1744,43 @@ define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v4i32_to_v8f16_scalar: @@ -1917,26 +1867,14 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1949,29 +1887,33 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1980,15 +1922,15 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2083,43 +2025,31 @@ define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v4i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2127,32 +2057,37 @@ define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4i32_scalar: ; VI: ; %bb.0: @@ -6279,72 +6214,42 @@ define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16: @@ -6415,66 +6320,52 @@ define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16_scalar: ; VI: ; %bb.0: @@ -6562,26 +6453,14 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6594,29 +6473,33 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6625,15 +6508,15 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -6728,43 +6611,31 @@ define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 ; SI-LABEL: bitcast_v8f16_to_v4f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6772,32 +6643,37 @@ define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4f32_scalar: ; VI: ; %bb.0: @@ -10583,72 +10459,42 @@ define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8f16: @@ -10724,63 +10570,43 @@ define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i64_to_v8f16_scalar: @@ -10867,26 +10693,14 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10899,29 +10713,33 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10930,15 +10748,15 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11033,43 +10851,31 @@ define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v2i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -11077,32 +10883,37 @@ define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2i64_scalar: ; VI: ; %bb.0: @@ -14490,68 +14301,40 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16: @@ -14621,64 +14404,50 @@ define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16_scalar: ; VI: ; %bb.0: @@ -14760,26 +14529,14 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -14792,29 +14549,33 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14823,15 +14584,15 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -14926,43 +14687,31 @@ define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i3 ; SI-LABEL: bitcast_v8f16_to_v2f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14970,32 +14719,37 @@ define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2f64_scalar: ; VI: ; %bb.0: @@ -17868,77 +17622,82 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v8i16_to_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v8, v1, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v1, v15 +; SI-NEXT: v_or_b32_e32 v9, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v10, v8, v12, 16 +; SI-NEXT: v_alignbit_b32 v11, v4, v14, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v10, v8, v9, 16 +; SI-NEXT: v_alignbit_b32 v11, v4, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8f16: @@ -18020,66 +17779,78 @@ define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v8i16_to_v8f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s22, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s21, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s24, s21, 16 +; SI-NEXT: s_or_b32 s25, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s4, s24 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s26, s22, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_mov_b32 s7, s25 +; SI-NEXT: s_mov_b32 s5, s27 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s22, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s14, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v8i16_to_v8f16_scalar: @@ -18184,79 +17955,63 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_alignbit_b32 v8, v2, v4, 16 -; SI-NEXT: v_alignbit_b32 v7, v6, v7, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_alignbit_b32 v8, v1, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v7, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8i16: @@ -18339,81 +18094,75 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v8i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v10, v5, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v8f16_to_v8i16_scalar: ; VI: ; %bb.0: @@ -21514,107 +21263,91 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v8bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v8 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21698,99 +21431,93 @@ define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i ; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v0 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s10, s16, 16 +; SI-NEXT: s_lshl_b32 s11, s6, 16 +; SI-NEXT: s_lshl_b32 s12, s17, 16 +; SI-NEXT: s_lshl_b32 s13, s7, 16 +; SI-NEXT: s_lshl_b32 s14, s18, 16 +; SI-NEXT: s_lshl_b32 s15, s8, 16 +; SI-NEXT: s_lshl_b32 s20, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s9, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v8f16_to_v8bf16_scalar: ; VI: ; %bb.0: @@ -21902,106 +21629,94 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_alignbit_b32 v5, v1, v9, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_alignbit_b32 v6, v5, v0, 16 +; SI-NEXT: v_alignbit_b32 v4, v3, v12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v7, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_alignbit_b32 v0, v0, v15, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v13, 16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v5, v1, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v6, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, v4, v7, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -22364,93 +22079,79 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshr_b64 v[18:19], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[11:12], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshr_b64 v[9:10], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshr_b64 v[3:4], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[2:3], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8f16_scalar: @@ -22867,26 +22568,14 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v8f16_to_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -22912,14 +22601,18 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v8, v20, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v4, v16, v1 -; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v12, v5, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -22931,18 +22624,18 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22953,7 +22646,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v8, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 @@ -23259,53 +22952,41 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: s_lshr_b32 s28, s19, 16 +; SI-NEXT: s_lshr_b32 s29, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v19, v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v8, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_or_b32_e32 v17, v25, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v24, v1 -; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s12, s7, s9 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_or_b32 s13, s7, s9 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[12:13], 8 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s9, s13, 8 +; SI-NEXT: s_bfe_u32 s11, s26, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s28, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23316,12 +22997,12 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v17, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; SI-NEXT: v_or_b32_e32 v18, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23335,37 +23016,55 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; SI-NEXT: v_or_b32_e32 v20, v2, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 ; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v20, s5 +; SI-NEXT: v_mov_b32_e32 v19, s4 +; SI-NEXT: v_mov_b32_e32 v18, s13 +; SI-NEXT: v_mov_b32_e32 v17, s12 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: .LBB105_5: ; %end ; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 ; SI-NEXT: v_mov_b32_e32 v4, v20 ; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v10, v12 ; SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; VI: ; %bb.0: @@ -23623,50 +23322,58 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v3 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v6, v20, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v7, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -23675,77 +23382,82 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: @@ -24165,120 +23877,136 @@ define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v0 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s40 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_lshr_b64 s[8:9], s[40:41], 16 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s15, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s14, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_or_b32 s43, s5, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s9, 16 +; SI-NEXT: s_mov_b32 s7, s41 +; SI-NEXT: s_mov_b32 s5, s43 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s14, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s24, 0xff -; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s22, 0xff -; SI-NEXT: s_lshl_b32 s9, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s20, 0xff -; SI-NEXT: s_lshl_b32 s10, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s18, 0xff -; SI-NEXT: s_lshl_b32 s11, s19, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s16, 0xff -; SI-NEXT: s_lshl_b32 s12, s17, 8 -; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_and_b32 s9, s22, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s9, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v16i8_to_v8f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index 8fbab2d6ab753..430a93d9e9bf0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -995,86 +995,50 @@ define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10f16: @@ -1152,76 +1116,51 @@ define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s13, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s8, s4 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s12, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v5i32_to_v10f16_scalar: @@ -1314,31 +1253,16 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1351,33 +1275,38 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1385,10 +1314,10 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1396,12 +1325,12 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1502,50 +1431,35 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-LABEL: bitcast_v10f16_to_v5i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1553,10 +1467,10 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1564,29 +1478,35 @@ define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5i32_scalar: ; VI: ; %bb.0: @@ -2309,94 +2229,58 @@ define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, v0, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v5f32_to_v10f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v10f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 @@ -2462,79 +2346,61 @@ define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v9, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10f16_scalar: ; VI: ; %bb.0: @@ -2636,31 +2502,16 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v10f16_to_v5f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2673,33 +2524,38 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2707,10 +2563,10 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2718,12 +2574,12 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2824,50 +2680,35 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v5f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v13, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s13, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2875,10 +2716,10 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2886,29 +2727,35 @@ define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5f32_scalar: ; VI: ; %bb.0: @@ -3025,92 +2872,97 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v10i16_to_v10f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v1, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v12, v10, v17, 16 +; SI-NEXT: v_alignbit_b32 v13, v5, v19, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v12, v10, v11, 16 +; SI-NEXT: v_alignbit_b32 v13, v5, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v10f16: @@ -3197,79 +3049,93 @@ define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i ; SI-LABEL: bitcast_v10i16_to_v10f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s22, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s21, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s26, s23, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s22, 16 +; SI-NEXT: s_or_b32 s6, s4, s26 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s28, s24, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s28 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 16 +; SI-NEXT: s_mov_b32 s7, s27 +; SI-NEXT: s_mov_b32 s5, s29 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s6, s22, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s14, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s21, s8, 0x30000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s15, s7, 16 +; SI-NEXT: s_lshr_b32 s22, s5, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s15, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s10, 16 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s8, s22, 16 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s21, 0xffff +; SI-NEXT: s_lshl_b32 s9, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v10i16_to_v10f16_scalar: @@ -3389,95 +3255,75 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_alignbit_b32 v10, v2, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_alignbit_b32 v10, v1, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v9, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v10i16: @@ -3565,97 +3411,89 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i ; SI-LABEL: bitcast_v10f16_to_v10i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s21, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v12, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_or_b32_e32 v11, v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_or_b32_e32 v10, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 ; SI-NEXT: v_or_b32_e32 v5, v14, v13 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v10f16_to_v10i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index ed44b1c0b294a..b6b321a08f7aa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -10,22 +10,15 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) { ; SI-LABEL: bitcast_i16_to_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_f16: @@ -114,16 +107,13 @@ define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) { ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %cmp.true ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: .LBB1_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_i16_to_f16_scalar: @@ -195,17 +185,27 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_3 +; SI-NEXT: ; %bb.1: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_cbranch_execnz .LBB2_4 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB2_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: .LBB2_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -291,20 +291,22 @@ define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cbranch_execnz .LBB3_4 ; SI-NEXT: .LBB3_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: .LBB3_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_4: +; SI-NEXT: .LBB3_3: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_i16_scalar: ; VI: ; %bb.0: @@ -838,26 +840,24 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -943,25 +943,26 @@ define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_cbranch_scc0 .LBB9_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB9_4 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_branch .LBB9_5 +; SI-NEXT: .LBB9_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB9_2 +; SI-NEXT: .LBB9_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: .LBB9_5: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_f16_to_bf16_scalar: ; VI: ; %bb.0: @@ -1059,20 +1060,23 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_f16: @@ -1196,22 +1200,19 @@ define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: .LBB11_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_bf16_to_f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 94ccde5a0a948..d463b115d1088 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1890,100 +1890,57 @@ define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12f16: @@ -2064,89 +2021,59 @@ define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v6i32_to_v12f16_scalar: @@ -2244,36 +2171,18 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2286,37 +2195,43 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2325,31 +2240,31 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2454,57 +2369,39 @@ define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v6i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2513,47 +2410,54 @@ define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6i32_scalar: ; VI: ; %bb.0: @@ -4162,100 +4066,57 @@ define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12f16: @@ -4331,92 +4192,70 @@ define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v11, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s21, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v6f32_to_v12f16_scalar: ; VI: ; %bb.0: @@ -4522,36 +4361,18 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v6f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4564,37 +4385,43 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4603,31 +4430,31 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4732,57 +4559,39 @@ define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v6f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4791,47 +4600,54 @@ define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6f32_scalar: ; VI: ; %bb.0: @@ -6009,47 +5825,22 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true @@ -6059,50 +5850,32 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12f16: @@ -6185,89 +5958,59 @@ define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s19, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s9, s4 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v3i64_to_v12f16_scalar: @@ -6365,36 +6108,18 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6407,37 +6132,43 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6446,31 +6177,31 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6575,57 +6306,39 @@ define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 ; SI-LABEL: bitcast_v12f16_to_v3i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6634,47 +6347,54 @@ define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3i64_scalar: ; VI: ; %bb.0: @@ -7447,94 +7167,54 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: .LBB52_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_alignbit_b32 v6, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12f16: @@ -7607,89 +7287,67 @@ define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: s_cbranch_scc0 .LBB53_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_4 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_branch .LBB53_5 +; SI-NEXT: .LBB53_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB53_2 +; SI-NEXT: .LBB53_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: .LBB53_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v3f64_to_v12f16_scalar: ; VI: ; %bb.0: @@ -7786,36 +7444,18 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7828,37 +7468,43 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_or_b32_e32 v2, v13, v2 -; SI-NEXT: v_or_b32_e32 v3, v11, v3 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7867,31 +7513,31 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -7996,57 +7642,39 @@ define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, ; SI-LABEL: bitcast_v12f16_to_v3f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s24, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s23, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s15, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s13, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8055,47 +7683,54 @@ define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3f64_scalar: ; VI: ; %bb.0: @@ -8222,107 +7857,114 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v12i16_to_v12f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v13, v1, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v11, v1, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v16, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v6, v1, v22 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v14, v13, v18, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v23, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v23 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v14, v13, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: @@ -8414,92 +8056,110 @@ define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i ; SI-LABEL: bitcast_v12i16_to_v12f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s25, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s24, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s27, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s8, s4, s40 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s28, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s26, 16 +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s29, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_lshr_b64 s[10:11], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[44:45], 16 +; SI-NEXT: s_mov_b32 s9, s41 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_mov_b32 s5, s45 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s25, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s27, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s24, s9, 16 +; SI-NEXT: s_lshr_b32 s25, s7, 16 +; SI-NEXT: s_lshr_b32 s26, s5, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s10, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v12i16_to_v12f16_scalar: @@ -8627,113 +8287,89 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v12f16_to_v12i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_or_b32_e32 v9, v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 ; SI-NEXT: v_or_b32_e32 v4, v4, v10 -; SI-NEXT: v_alignbit_b32 v12, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v10, 16 +; SI-NEXT: v_alignbit_b32 v12, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v3, v11, 16 +; SI-NEXT: v_alignbit_b32 v10, v5, v10, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v12i16: @@ -8826,114 +8462,104 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i ; SI-LABEL: bitcast_v12f16_to_v12i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v16, v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v15, v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_or_b32_e32 v12, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v14, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 ; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 ; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v12f16_to_v12i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index cd5f3490a69e9..e0fac42ac9d77 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1165,114 +1165,64 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: @@ -1357,102 +1307,67 @@ define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s15, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v7i32_to_v14f16_scalar: @@ -1556,41 +1471,20 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1603,20 +1497,34 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -1624,20 +1532,13 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1646,25 +1547,25 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1672,12 +1573,12 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1789,64 +1690,43 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-LABEL: bitcast_v14f16_to_v7i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1855,25 +1735,25 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -1881,29 +1761,37 @@ define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7i32_scalar: ; VI: ; %bb.0: @@ -2766,114 +2654,64 @@ define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v8, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v11, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v7, v0, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: @@ -2952,105 +2790,79 @@ define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s22, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[7:8], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v12, s23 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v7f32_to_v14f16_scalar: ; VI: ; %bb.0: @@ -3160,41 +2972,20 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v14f16_to_v7f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3207,20 +2998,34 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v20, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -3228,20 +3033,13 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3250,25 +3048,25 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3276,12 +3074,12 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3393,64 +3191,43 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v7f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s18, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s26, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s15, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3459,25 +3236,25 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3485,29 +3262,37 @@ define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7f32_scalar: ; VI: ; %bb.0: @@ -3645,46 +3430,52 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v14i16_to_v14f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v15, v1, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v13, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v14, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v7, v1, v26 +; SI-NEXT: v_or_b32_e32 v11, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v17, v15, v22, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v24, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v27, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v21 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -3692,75 +3483,76 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v17, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: @@ -3858,105 +3650,125 @@ define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i ; SI-LABEL: bitcast_v14i16_to_v14f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s26, s22, 16 +; SI-NEXT: s_lshr_b32 s29, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s28, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_and_b32 s5, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s26, 16 +; SI-NEXT: s_or_b32 s23, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s27, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s40, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s8, s4, s44 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s41, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_or_b32 s6, s4, s46 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s42, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_lshr_b64 s[10:11], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[56:57], 16 +; SI-NEXT: s_mov_b32 s9, s45 +; SI-NEXT: s_mov_b32 s7, s47 +; SI-NEXT: s_mov_b32 s5, s57 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s28, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s26, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s23, s10, 0x30000 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s27, s9, 16 +; SI-NEXT: s_lshr_b32 s28, s7, 16 +; SI-NEXT: s_lshr_b32 s29, s5, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s10, s12, 16 +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s10, s14, 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s10, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: s_and_b32 s10, s23, 0xffff +; SI-NEXT: s_lshl_b32 s11, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v14i16_to_v14f16_scalar: @@ -4094,128 +3906,100 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 ; SI-NEXT: v_or_b32_e32 v4, v4, v12 -; SI-NEXT: v_alignbit_b32 v14, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v13, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v12, v8, v12, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v13, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v12, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v14i16: @@ -4314,130 +4098,118 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s23, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v18, v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v13, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_or_b32_e32 v14, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_lshr_b64 v[10:11], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 ; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 075216fc4791c..dd6846e7d0537 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -2149,128 +2149,71 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16f16: @@ -2358,115 +2301,75 @@ define inreg <16 x half> @bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v8i32_to_v16f16_scalar: @@ -2575,46 +2478,22 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2627,22 +2506,38 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -2651,21 +2546,13 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2678,10 +2565,10 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -2690,18 +2577,18 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2709,11 +2596,11 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2830,71 +2717,47 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v8i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2902,10 +2765,10 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2913,11 +2776,11 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2925,11 +2788,11 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2937,29 +2800,38 @@ define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8i32_scalar: ; VI: ; %bb.0: @@ -9512,128 +9384,71 @@ define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: @@ -9714,118 +9529,88 @@ define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: v_mov_b32_e32 v14, s15 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v8f32_to_v16f16_scalar: ; VI: ; %bb.0: @@ -9939,46 +9724,22 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9991,22 +9752,38 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -10015,21 +9792,13 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10042,10 +9811,10 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -10054,18 +9823,18 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10073,11 +9842,11 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10194,71 +9963,47 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v8f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10266,10 +10011,10 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10277,11 +10022,11 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10289,11 +10034,11 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10301,29 +10046,38 @@ define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8f32_scalar: ; VI: ; %bb.0: @@ -16462,59 +16216,26 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true @@ -16526,64 +16247,40 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: @@ -16673,115 +16370,75 @@ define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s24, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s10, s6 +; SI-NEXT: s_and_b32 s10, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v4i64_to_v16f16_scalar: @@ -16890,46 +16547,22 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -16942,22 +16575,38 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -16966,21 +16615,13 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16993,10 +16634,10 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -17005,18 +16646,18 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -17024,11 +16665,11 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -17145,71 +16786,47 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v4i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -17217,10 +16834,10 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -17228,11 +16845,11 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -17240,11 +16857,11 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -17252,29 +16869,38 @@ define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4i64_scalar: ; VI: ; %bb.0: @@ -22903,120 +22529,67 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: @@ -23092,114 +22665,84 @@ define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v4, v11, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v4f64_to_v16f16_scalar: ; VI: ; %bb.0: @@ -23301,46 +22844,22 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23353,22 +22872,38 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -23377,21 +22912,13 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -23404,10 +22931,10 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -23416,18 +22943,18 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -23435,11 +22962,11 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -23556,71 +23083,47 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-LABEL: bitcast_v16f16_to_v4f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s20, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s29, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s28, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s27, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s25, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s15, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -23628,10 +23131,10 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -23639,11 +23142,11 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -23651,11 +23154,11 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -23663,29 +23166,38 @@ define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4f64_scalar: ; VI: ; %bb.0: @@ -28561,51 +28073,59 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v16i16_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v17, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v15, v1, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v22, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v13, v1, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v18, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v8, v1, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v19, v17, v24, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v26, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v28, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v30, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v30 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -28614,84 +28134,85 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v19, v17, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v16, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16f16: @@ -28794,118 +28315,142 @@ define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i ; SI-LABEL: bitcast_v16i16_to_v16f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s47, s22, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s56, s44, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s10, s4, s56 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s58, s45, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s42, 16 +; SI-NEXT: s_or_b32 s8, s4, s58 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s60, s46, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s43, 16 +; SI-NEXT: s_or_b32 s6, s4, s60 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s62, s47, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s62 +; SI-NEXT: s_lshr_b64 s[12:13], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[62:63], 16 +; SI-NEXT: s_mov_b32 s11, s57 +; SI-NEXT: s_mov_b32 s9, s59 +; SI-NEXT: s_mov_b32 s7, s61 +; SI-NEXT: s_mov_b32 s5, s63 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s43, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s42, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s9, 16 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v16i16_to_v16f16_scalar: @@ -29051,146 +28596,114 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_or_b32_e32 v12, v12, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 ; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_alignbit_b32 v16, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 -; SI-NEXT: v_alignbit_b32 v14, v6, v14, 16 -; SI-NEXT: v_alignbit_b32 v13, v10, v13, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_alignbit_b32 v16, v1, v8, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v15, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v13, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v8 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16i16: @@ -29294,147 +28807,133 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v17, v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v16, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v23, v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v20, v9, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v22, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_or_b32_e32 v19, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_or_b32_e32 v18, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v20, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v7, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 -; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v21, s8 +; SI-NEXT: v_mov_b32_e32 v22, s7 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v20, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v8, s9 +; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; VI: ; %bb.0: @@ -34904,91 +34403,59 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v16bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -35001,103 +34468,103 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v10 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v16 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v18 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35202,183 +34669,169 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v0 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: s_lshl_b32 s24, s17, 16 +; SI-NEXT: s_lshl_b32 s25, s7, 16 +; SI-NEXT: s_lshl_b32 s26, s18, 16 +; SI-NEXT: s_lshl_b32 s27, s8, 16 +; SI-NEXT: s_lshl_b32 s28, s19, 16 +; SI-NEXT: s_lshl_b32 s29, s9, 16 +; SI-NEXT: s_lshl_b32 s40, s20, 16 +; SI-NEXT: s_lshl_b32 s41, s10, 16 +; SI-NEXT: s_lshl_b32 s42, s21, 16 +; SI-NEXT: s_lshl_b32 s43, s11, 16 +; SI-NEXT: s_lshl_b32 s44, s22, 16 +; SI-NEXT: s_lshl_b32 s45, s12, 16 +; SI-NEXT: s_lshl_b32 s46, s23, 16 +; SI-NEXT: s_lshl_b32 s47, s14, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v8, s46 +; SI-NEXT: v_mov_b32_e32 v10, s45 +; SI-NEXT: v_mov_b32_e32 v7, s44 +; SI-NEXT: v_mov_b32_e32 v11, s43 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v12, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: .LBB101_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[15:16], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[13:14], 16 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[8:9], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v16f16_to_v16bf16_scalar: ; VI: ; %bb.0: @@ -35540,195 +34993,171 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v11, v1, v17, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_alignbit_b32 v12, v11, v0, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_alignbit_b32 v13, v10, v0, 16 +; SI-NEXT: v_alignbit_b32 v6, v5, v21, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_alignbit_b32 v14, v6, v0, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_alignbit_b32 v15, v8, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v30, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v28, 16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v23 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v9, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v8, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v6, v5, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v10, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v11, v1, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v12, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, v6, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, v8, v15, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16f16: @@ -36366,173 +35795,145 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_and_b32 s19, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v39 +; SI-NEXT: v_lshr_b64 v[13:14], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshr_b64 v[22:23], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshr_b64 v[7:8], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[13:14], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16f16_scalar: @@ -37266,46 +36667,22 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v16f16_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v37, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -37345,22 +36722,30 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v8, v36, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v35, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v16, v39, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v38, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v50, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v4, v32, v1 -; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v8, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v16, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v28, v5, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -37381,25 +36766,25 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: .LBB104_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -37407,9 +36792,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_or_b32_e32 v24, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -37421,28 +36806,28 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v16, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v20, v0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 @@ -37937,219 +37322,221 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s22, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s20, 16 +; SI-NEXT: s_lshr_b32 s74, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s72, s17, 16 +; SI-NEXT: s_lshr_b32 s73, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_or_b32_e32 v48, v8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v0, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v35, v39, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v32, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_or_b32_e32 v37, v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v53, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_or_b32_e32 v33, v41, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_or_b32_e32 v34, v40, v2 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], 16 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s26, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s74, 16 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: s_or_b32 s44, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s60, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[60:61], 8 +; SI-NEXT: s_lshr_b32 s5, s11, 8 +; SI-NEXT: s_lshr_b32 s7, s27, 8 +; SI-NEXT: s_lshr_b32 s13, s45, 8 +; SI-NEXT: s_lshr_b32 s15, s61, 8 +; SI-NEXT: s_bfe_u32 s9, s72, 0x80008 +; SI-NEXT: s_bfe_u32 s25, s74, 0x80008 +; SI-NEXT: s_bfe_u32 s29, s76, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s78, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v33, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_or_b32_e32 v34, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 +; SI-NEXT: v_lshr_b64 v[24:25], v[33:34], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 -; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_or_b32_e32 v35, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_or_b32_e32 v34, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_or_b32_e32 v36, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v35, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_or_b32_e32 v38, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v48, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v49, v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 +; SI-NEXT: v_or_b32_e32 v48, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_or_b32_e32 v49, v1, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v30, s78 +; SI-NEXT: v_mov_b32_e32 v22, s76 +; SI-NEXT: v_mov_b32_e32 v14, s74 +; SI-NEXT: v_mov_b32_e32 v6, s72 +; SI-NEXT: v_mov_b32_e32 v31, s41 +; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v49, s11 +; SI-NEXT: v_mov_b32_e32 v48, s10 +; SI-NEXT: v_mov_b32_e32 v38, s27 +; SI-NEXT: v_mov_b32_e32 v37, s26 +; SI-NEXT: v_mov_b32_e32 v36, s45 +; SI-NEXT: v_mov_b32_e32 v35, s44 +; SI-NEXT: v_mov_b32_e32 v34, s61 +; SI-NEXT: v_mov_b32_e32 v33, s60 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v11, s12 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v17, s42 +; SI-NEXT: v_mov_b32_e32 v27, s46 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v25, s58 +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v35 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: v_mov_b32_e32 v12, v36 -; SI-NEXT: v_mov_b32_e32 v16, v37 -; SI-NEXT: v_mov_b32_e32 v18, v24 -; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v12, v38 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v16, v35 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v26, v24 ; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v50 ; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; VI: ; %bb.0: @@ -38588,95 +37975,111 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v32i8_to_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v19 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v55 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v23, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v9, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v10, v37, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v15, v0, v9 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_or_b32_e32 v11, v51, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_or_b32_e32 v7, v7, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v11, v0, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v12, v52, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v0, v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v13, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v14, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -38694,140 +38097,149 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 ; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v51, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 ; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 ; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v13, v1, v23, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: @@ -39537,240 +38949,272 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_readfirstlane_b32 s46, v17 -; SI-NEXT: v_readfirstlane_b32 s47, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v13 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s40, v11 -; SI-NEXT: v_readfirstlane_b32 s41, v10 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s63, v17 +; SI-NEXT: v_readfirstlane_b32 s62, v16 +; SI-NEXT: v_readfirstlane_b32 s74, v15 +; SI-NEXT: v_readfirstlane_b32 s75, v14 +; SI-NEXT: v_readfirstlane_b32 s77, v13 +; SI-NEXT: v_readfirstlane_b32 s76, v12 +; SI-NEXT: v_readfirstlane_b32 s78, v11 +; SI-NEXT: v_readfirstlane_b32 s79, v10 +; SI-NEXT: v_readfirstlane_b32 s57, v9 +; SI-NEXT: v_readfirstlane_b32 s56, v8 +; SI-NEXT: v_readfirstlane_b32 s58, v7 +; SI-NEXT: v_readfirstlane_b32 s59, v6 +; SI-NEXT: v_readfirstlane_b32 s61, v5 +; SI-NEXT: v_readfirstlane_b32 s60, v4 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_readfirstlane_b32 s73, v2 +; SI-NEXT: v_readfirstlane_b32 s46, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s47, v0 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s8, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s21, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s22, 0xff ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s23, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s44, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s47, 0xff +; SI-NEXT: s_or_b32 s10, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s46, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s45, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s15, s8, s45 +; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: s_lshl_b32 s9, s58, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s14, s6, s5 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s56, 0xff +; SI-NEXT: s_or_b32 s6, s4, s14 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s57, 24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s88, s11, s9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s61, 24 +; SI-NEXT: s_or_b32 s41, s8, s88 +; SI-NEXT: s_and_b32 s8, s75, 0xff +; SI-NEXT: s_lshl_b32 s9, s74, 8 +; SI-NEXT: s_or_b32 s40, s7, s5 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s78, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s62, 0xff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s63, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s89, s11, s9 +; SI-NEXT: s_lshl_b32 s9, s77, 24 +; SI-NEXT: s_or_b32 s42, s9, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s43, s7, s89 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: s_or_b32 s8, s5, s42 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; SI-NEXT: s_mov_b32 s5, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s15, s44, 16 +; SI-NEXT: s_lshr_b32 s41, s45, 16 +; SI-NEXT: s_lshr_b32 s43, s88, 16 +; SI-NEXT: s_lshr_b32 s13, s89, 16 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s28, 0xff -; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s26, 0xff -; SI-NEXT: s_lshl_b32 s11, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s24, 0xff -; SI-NEXT: s_lshl_b32 s13, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s22, 0xff -; SI-NEXT: s_lshl_b32 s15, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_add_i32 s76, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s43, s43, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s41, s41, 0xff -; SI-NEXT: s_lshl_b32 s40, s40, 8 -; SI-NEXT: s_or_b32 s13, s15, s13 -; SI-NEXT: s_and_b32 s15, s20, 0xff -; SI-NEXT: s_lshl_b32 s20, s21, 8 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s5, s44, s5 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_or_b32 s15, s20, s15 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s6, s76, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s77, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_add_i32 s62, s62, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s63, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s60, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s61, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: s_lshl_b32 s6, s58, 8 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s56, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s40, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s6, s57, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s10, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s11, s47, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s10, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s10, s16, 0xff +; SI-NEXT: s_lshl_b32 s11, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s18, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s19, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s20, 0xff +; SI-NEXT: s_lshl_b32 s12, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s22, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshl_b32 s12, s23, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s15, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s7, 16 +; SI-NEXT: s_lshr_b32 s43, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s15, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s41, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s40, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s13, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v32i8_to_v16f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 6c8abf8733579..6656733d53e51 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1330,142 +1330,78 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: .LBB8_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: @@ -1556,128 +1492,83 @@ define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s28, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s25, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v9i32_to_v18f16_scalar: @@ -1792,51 +1683,24 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1849,24 +1713,42 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v22, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -1876,22 +1758,13 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -1904,10 +1777,10 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -1916,25 +1789,25 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -1942,12 +1815,12 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2070,121 +1943,94 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-LABEL: bitcast_v18f16_to_v9i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2192,29 +2038,39 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9i32_scalar: ; VI: ; %bb.0: @@ -3233,142 +3089,78 @@ define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v10, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v11, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v0, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: @@ -3452,131 +3244,97 @@ define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v17, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v11, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v9f32_to_v18f16_scalar: ; VI: ; %bb.0: @@ -3713,51 +3471,24 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v9f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v10, v7 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3770,24 +3501,42 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v2, v22, v2 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -3797,22 +3546,13 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3825,10 +3565,10 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3837,25 +3577,25 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3863,12 +3603,12 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3991,121 +3731,94 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v9f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s22, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s41, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s29, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s28, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4113,29 +3826,39 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9f32_scalar: ; VI: ; %bb.0: @@ -4293,56 +4016,64 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v18i16_to_v18f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v19, v1, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v22, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v17, v1, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v20, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v15, v1, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v1, v35 +; SI-NEXT: v_or_b32_e32 v14, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v21, v19, v28, 16 +; SI-NEXT: v_alignbit_b32 v23, v17, v30, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v34, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v27 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4352,93 +4083,94 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v2, v31, v2 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v21, v19, v22, 16 +; SI-NEXT: v_alignbit_b32 v23, v17, v20, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: @@ -4546,131 +4278,157 @@ define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i ; SI-LABEL: bitcast_v18i16_to_v18f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s20, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b32 s47, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: s_and_b32 s5, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s42, 16 +; SI-NEXT: s_or_b32 s25, s5, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s43, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s47, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s60, s56, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s45, 16 +; SI-NEXT: s_or_b32 s8, s4, s60 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s62, s57, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_or_b32 s6, s4, s62 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s72 +; SI-NEXT: s_lshr_b64 s[12:13], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[72:73], 16 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_mov_b32 s9, s61 +; SI-NEXT: s_mov_b32 s7, s63 +; SI-NEXT: s_mov_b32 s5, s73 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s46, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s45, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s56, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s47, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s25, s12, 0x30000 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s43, s11, 16 +; SI-NEXT: s_lshr_b32 s44, s9, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s43, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s12, s14, 16 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s12 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s12, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_and_b32 s12, s25, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v8, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v18i16_to_v18f16_scalar: @@ -4836,162 +4594,126 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v18f16_to_v18i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v11, v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 ; SI-NEXT: v_or_b32_e32 v4, v4, v16 ; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_alignbit_b32 v18, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v17, v14, v17, 16 -; SI-NEXT: v_alignbit_b32 v16, v10, v16, 16 -; SI-NEXT: v_alignbit_b32 v15, v12, v15, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v5, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v7, v15, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_or_b32_e32 v2, v2, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 ; SI-NEXT: v_or_b32_e32 v4, v4, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v18i16: @@ -5101,163 +4823,147 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s25, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v17, v8, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v16, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_or_b32_e32 v19, v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v22, v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v8, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_or_b32_e32 v18, v3, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_or_b32_e32 v20, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_or_b32_e32 v19, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 +; SI-NEXT: v_or_b32_e32 v7, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s24 +; SI-NEXT: v_or_b32_e32 v5, v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v23, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v24, v1, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[8:9], v[6:7], 16 -; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v25, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v24, s24 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 20a8e6dc2727e..bb0e13e2997e7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1419,156 +1419,86 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v20f16: @@ -1664,141 +1594,91 @@ define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_or_b32_e32 v6, v12, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v10i32_to_v20f16_scalar: @@ -1918,56 +1798,26 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1980,26 +1830,46 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -2010,23 +1880,13 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2035,25 +1895,25 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2061,11 +1921,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2073,11 +1933,11 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2085,12 +1945,12 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -2218,85 +2078,55 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v10i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -2305,25 +2135,25 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2331,11 +2161,11 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -2343,11 +2173,11 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2355,29 +2185,40 @@ define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v10i32_scalar: ; VI: ; %bb.0: @@ -8470,171 +8311,101 @@ define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB28_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB28_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v10f32_to_v20f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -8707,144 +8478,106 @@ define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v19, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 ; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; SI-NEXT: v_or_b32_e32 v6, v12, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v10f32_to_v20f16_scalar: ; VI: ; %bb.0: @@ -8984,56 +8717,26 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v10f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9046,54 +8749,64 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: v_or_b32_e32 v4, v21, v4 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_2 -; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -9101,25 +8814,25 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v16 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -9127,11 +8840,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9139,11 +8852,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -9151,12 +8864,12 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9284,85 +8997,55 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v10f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s28, s24, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s41, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s29, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9371,25 +9054,25 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -9397,11 +9080,11 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9409,11 +9092,11 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -9421,29 +9104,40 @@ define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v10f32_scalar: ; VI: ; %bb.0: @@ -14680,61 +14374,71 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v20i16_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v20, v1, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v29, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v17, v1, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v24, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v14, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v1, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v23, v22, v30, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v32, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v34, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v36, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v38, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v38 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -14745,102 +14449,103 @@ define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v38 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v6, v36, v6 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v23, v22, v29, 16 +; SI-NEXT: v_alignbit_b32 v25, v20, v24, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i16_to_v20f16: @@ -14954,144 +14659,174 @@ define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i ; SI-LABEL: bitcast_v20i16_to_v20f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s63, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s62, s22, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s18, 16 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s46, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s28, s59, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s12, s4, s28 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s56, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s42, s61, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s8, s4, s42 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s58, 16 +; SI-NEXT: s_or_b32 s6, s4, s44 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s72, s63, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_or_b32 s4, s4, s72 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[40:41], 16 +; SI-NEXT: s_mov_b32 s13, s29 +; SI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[72:73], 16 +; SI-NEXT: s_mov_b32 s7, s45 +; SI-NEXT: s_mov_b32 s5, s73 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s62, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s57, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s61, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s56, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s60, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s47, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s59, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: s_lshr_b32 s47, s11, 16 +; SI-NEXT: s_lshr_b32 s56, s9, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s5, 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s47, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v20i16_to_v20f16_scalar: @@ -15265,178 +15000,138 @@ define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 ; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_or_b32_e32 v12, v12, v17 -; SI-NEXT: v_or_b32_e32 v13, v13, v16 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_alignbit_b32 v19, v1, v19, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 -; SI-NEXT: v_alignbit_b32 v17, v5, v17, 16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_alignbit_b32 v10, v1, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v18, 16 ; SI-NEXT: v_alignbit_b32 v16, v7, v16, 16 -; SI-NEXT: v_alignbit_b32 v14, v9, v14, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v13, 16 ; SI-NEXT: .LBB46_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v10 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v15 -; SI-NEXT: v_or_b32_e32 v6, v6, v12 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15552,149 +15247,133 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s10, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v20, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_or_b32_e32 v7, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_or_b32_e32 v27, v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v29, v11, v2 -; SI-NEXT: v_or_b32_e32 v28, v10, v4 -; SI-NEXT: v_or_b32_e32 v26, v12, v6 -; SI-NEXT: v_or_b32_e32 v24, v13, v8 +; SI-NEXT: v_or_b32_e32 v29, v10, v2 +; SI-NEXT: v_or_b32_e32 v28, v11, v4 +; SI-NEXT: v_or_b32_e32 v27, v12, v6 +; SI-NEXT: v_or_b32_e32 v26, v13, v8 ; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s9 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s7 +; SI-NEXT: v_mov_b32_e32 v25, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v26, s24 +; SI-NEXT: v_mov_b32_e32 v27, s22 +; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v18, s15 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: .LBB47_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 @@ -15703,29 +15382,27 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; VI: ; %bb.0: @@ -22456,36 +22133,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v40i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -22494,356 +22141,354 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_or_b32_e32 v24, v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_or_b32_e32 v20, v49, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_or_b32_e32 v13, v53, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_or_b32_e32 v12, v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 -; SI-NEXT: v_or_b32_e32 v10, v40, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_or_b32_e32 v11, v55, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_or_b32_e32 v8, v43, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_or_b32_e32 v9, v42, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v7, v46, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 -; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 -; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 -; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 -; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 -; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v1, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v25, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v22, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v23, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_alignbit_b32 v32, v25, v27, 24 +; SI-NEXT: v_alignbit_b32 v37, v25, v27, 16 +; SI-NEXT: v_alignbit_b32 v49, v25, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, v23, v22, 24 +; SI-NEXT: v_alignbit_b32 v35, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v48, v23, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 24 +; SI-NEXT: v_alignbit_b32 v33, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 8 +; SI-NEXT: v_alignbit_b32 v24, v17, v11, 24 +; SI-NEXT: v_alignbit_b32 v28, v17, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v11, 8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 +; SI-NEXT: v_bfe_u32 v42, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v12, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v45 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v17, v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v18, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v19, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v20, v5, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v21, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v13, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v24, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_or_b32_e32 v20, v16, v14 -; SI-NEXT: v_alignbit_b32 v26, v20, v24, 24 -; SI-NEXT: v_alignbit_b32 v30, v20, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v20, v24, 8 -; SI-NEXT: v_alignbit_b32 v25, v12, v13, 24 -; SI-NEXT: v_alignbit_b32 v27, v12, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; SI-NEXT: v_alignbit_b32 v19, v11, v10, 24 -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v10, 8 -; SI-NEXT: v_alignbit_b32 v16, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v17, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v8, 8 -; SI-NEXT: v_alignbit_b32 v14, v6, v7, 24 -; SI-NEXT: v_alignbit_b32 v15, v6, v7, 16 -; SI-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v6 -; SI-NEXT: v_bfe_u32 v48, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_or_b32_e32 v23, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: v_alignbit_b32 v32, v25, v27, 24 +; SI-NEXT: v_alignbit_b32 v37, v25, v27, 16 +; SI-NEXT: v_alignbit_b32 v49, v25, v27, 8 +; SI-NEXT: v_alignbit_b32 v31, v23, v22, 24 +; SI-NEXT: v_alignbit_b32 v35, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v48, v23, v22, 8 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 24 +; SI-NEXT: v_alignbit_b32 v33, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 8 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v19, v18, 8 +; SI-NEXT: v_alignbit_b32 v24, v17, v11, 24 +; SI-NEXT: v_alignbit_b32 v28, v17, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v17, v11, 8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 +; SI-NEXT: v_bfe_u32 v42, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v15, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v12, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v26, v26, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v39 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v48 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v25 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v5, v5, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v37 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v19 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v36 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v14 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload @@ -23616,327 +23261,328 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s30, 0 +; SI-NEXT: v_writelane_b32 v12, s31, 1 +; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_writelane_b32 v12, s35, 3 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s24, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s94, s21, 16 +; SI-NEXT: s_lshr_b32 s95, s20, 16 +; SI-NEXT: s_lshr_b32 s92, s19, 16 +; SI-NEXT: s_lshr_b32 s93, s18, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s16, 16 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s34, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; SI-NEXT: s_or_b32 s5, s5, s15 +; SI-NEXT: s_lshr_b32 s43, s13, 8 +; SI-NEXT: s_lshr_b32 s41, s11, 8 +; SI-NEXT: s_lshr_b32 s29, s9, 8 +; SI-NEXT: s_lshr_b32 s27, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: s_bfe_u32 s45, s90, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s30, 0x80008 +; SI-NEXT: s_bfe_u32 s61, s34, 0x80008 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: s_cbranch_execnz .LBB61_4 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 -; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB61_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v7 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v6 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: s_or_b32 s10, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v9 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v12 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v15 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s23, s13, 8 -; SI-NEXT: s_lshr_b32 s21, s11, 8 -; SI-NEXT: s_lshr_b32 s19, s9, 8 -; SI-NEXT: s_lshr_b32 s17, s7, 8 -; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB61_3 -; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_readfirstlane_b32 s5, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s30 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s95 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s94 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s93 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s92 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s91 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v7 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v5 ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s23, s13, 8 -; SI-NEXT: s_lshr_b32 s21, s11, 8 -; SI-NEXT: s_lshr_b32 s19, s9, 8 -; SI-NEXT: s_lshr_b32 s17, s7, 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s43, s13, 8 +; SI-NEXT: s_lshr_b32 s41, s11, 8 +; SI-NEXT: s_lshr_b32 s29, s9, 8 +; SI-NEXT: s_lshr_b32 s27, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: v_bfe_u32 v24, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v22, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v18, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v16, v1, 8, 8 -; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_bfe_u32 v10, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v9, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v8, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v6, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v5, v1, 8, 8 +; SI-NEXT: s_branch .LBB61_5 +; SI-NEXT: .LBB61_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: s_branch .LBB61_2 +; SI-NEXT: .LBB61_4: +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s30 +; SI-NEXT: v_mov_b32_e32 v3, s94 +; SI-NEXT: v_mov_b32_e32 v4, s92 +; SI-NEXT: v_mov_b32_e32 v7, s90 +; SI-NEXT: v_mov_b32_e32 v5, s61 +; SI-NEXT: v_mov_b32_e32 v6, s59 +; SI-NEXT: v_mov_b32_e32 v8, s57 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s45 +; SI-NEXT: .LBB61_5: ; %end ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_lshl_b32 s16, s40, 8 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v11, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s23, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s43, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, s12, v7 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s26, 8 +; SI-NEXT: s_lshl_b32 s12, s58, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s22, 0xff +; SI-NEXT: s_and_b32 s12, s42, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s18, 24 +; SI-NEXT: s_lshl_b32 s13, s28, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v10, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s21, 8 +; SI-NEXT: s_lshl_b32 s11, s41, 8 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s40, 8 +; SI-NEXT: s_lshl_b32 s10, s60, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xff +; SI-NEXT: s_and_b32 s10, s46, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s24, 24 +; SI-NEXT: s_lshl_b32 s11, s44, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v7, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_lshl_b32 s9, s29, 8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s44, 8 +; SI-NEXT: s_lshl_b32 s8, s76, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_and_b32 s8, s72, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s46, 24 +; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 @@ -23946,21 +23592,21 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_lshl_b32 s7, s27, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: s_lshl_b32 s6, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s58, 0xff +; SI-NEXT: s_and_b32 s6, s74, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: s_lshl_b32 s7, s62, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -23975,46 +23621,21 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v12, 3 +; SI-NEXT: v_readlane_b32 s34, v12, 2 +; SI-NEXT: v_readlane_b32 s31, v12, 1 +; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; VI: ; %bb.0: @@ -24753,129 +24374,150 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v29 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v31 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v35 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v35 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v57 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v41 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v36, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v29, v0, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v0, v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v31, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v39, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v21, v0, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v18, v56, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_or_b32_e32 v11, v52, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v14, v47, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v20, v59, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v13, v46, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v0, v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v17, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v20, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -24893,191 +24535,200 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 -; SI-NEXT: v_add_i32_e32 v35, vcc, 0x300, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 ; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v17, v1, v29, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v7, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26032,301 +25683,365 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v40i8_to_v20f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v27, s30, 0 +; SI-NEXT: v_writelane_b32 v27, s31, 1 +; SI-NEXT: v_writelane_b32 v27, s34, 2 +; SI-NEXT: v_writelane_b32 v27, s35, 3 +; SI-NEXT: v_writelane_b32 v27, s36, 4 +; SI-NEXT: v_writelane_b32 v27, s37, 5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: v_readfirstlane_b32 s62, v25 -; SI-NEXT: v_readfirstlane_b32 s63, v24 -; SI-NEXT: v_readfirstlane_b32 s60, v23 -; SI-NEXT: v_readfirstlane_b32 s61, v22 -; SI-NEXT: v_readfirstlane_b32 s58, v21 -; SI-NEXT: v_readfirstlane_b32 s59, v20 -; SI-NEXT: v_readfirstlane_b32 s56, v19 -; SI-NEXT: v_readfirstlane_b32 s57, v18 -; SI-NEXT: v_readfirstlane_b32 s46, v17 -; SI-NEXT: v_readfirstlane_b32 s47, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: v_readfirstlane_b32 s42, v13 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s41, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_writelane_b32 v27, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s91, v25 +; SI-NEXT: v_readfirstlane_b32 s90, v24 +; SI-NEXT: v_readfirstlane_b32 s94, v23 +; SI-NEXT: v_readfirstlane_b32 s95, v22 +; SI-NEXT: v_readfirstlane_b32 s31, v21 +; SI-NEXT: v_readfirstlane_b32 s30, v20 +; SI-NEXT: v_readfirstlane_b32 s34, v19 +; SI-NEXT: v_readfirstlane_b32 s35, v18 +; SI-NEXT: v_readfirstlane_b32 s75, v17 +; SI-NEXT: v_readfirstlane_b32 s74, v16 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v14 +; SI-NEXT: v_readfirstlane_b32 s89, v13 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s92, v11 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_readfirstlane_b32 s61, v9 +; SI-NEXT: v_readfirstlane_b32 s60, v8 +; SI-NEXT: v_readfirstlane_b32 s62, v7 +; SI-NEXT: v_readfirstlane_b32 s63, v6 +; SI-NEXT: v_readfirstlane_b32 s73, v5 +; SI-NEXT: v_readfirstlane_b32 s72, v4 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: v_readfirstlane_b32 s58, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_readfirstlane_b32 s59, v0 +; SI-NEXT: v_writelane_b32 v27, s39, 7 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_lshl_b32 s5, s56, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s61, 0xff -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s9, s5, s6 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s7, s76, 8 +; SI-NEXT: s_or_b32 s10, s5, s7 +; SI-NEXT: s_and_b32 s5, s72, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s73, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s89, 24 +; SI-NEXT: s_or_b32 s44, s7, s5 +; SI-NEXT: s_and_b32 s5, s35, 0xff +; SI-NEXT: s_lshl_b32 s7, s34, 8 +; SI-NEXT: s_or_b32 s12, s5, s7 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s13, s23, 24 +; SI-NEXT: s_or_b32 s56, s13, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s13, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s58, 24 +; SI-NEXT: s_or_b32 s57, s14, s13 +; SI-NEXT: s_and_b32 s13, s63, 0xff +; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s60, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s61, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s15, s14 +; SI-NEXT: s_or_b32 s43, s13, vcc_lo +; SI-NEXT: s_and_b32 s13, s79, 0xff +; SI-NEXT: s_lshl_b32 s14, s78, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s74, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s75, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s15, s14 +; SI-NEXT: s_or_b32 s45, s13, vcc_hi +; SI-NEXT: s_and_b32 s13, s95, 0xff +; SI-NEXT: s_lshl_b32 s14, s94, 8 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s90, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s91, 24 +; SI-NEXT: s_or_b32 s36, s15, s14 +; SI-NEXT: s_and_b32 s14, s30, 0xff +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s31, 24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s56 +; SI-NEXT: s_or_b32 s7, s7, s57 +; SI-NEXT: s_or_b32 s46, s15, s14 +; SI-NEXT: s_or_b32 s47, s13, s36 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s37, s10, 0xffff +; SI-NEXT: s_and_b32 s38, s11, 0xffff +; SI-NEXT: s_and_b32 s39, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_or_b32 s10, s9, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_or_b32 s8, s37, s42 +; SI-NEXT: s_mov_b32 s9, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_or_b32 s6, s38, s44 +; SI-NEXT: s_mov_b32 s7, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_or_b32 s4, s39, s46 +; SI-NEXT: s_mov_b32 s5, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_lshr_b32 s41, s56, 16 +; SI-NEXT: s_lshr_b32 s43, s57, 16 +; SI-NEXT: s_lshr_b32 s45, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s15, s36, 16 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_or_b32 s8, s8, s11 -; SI-NEXT: s_and_b32 s11, s40, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_or_b32 s6, s6, s11 -; SI-NEXT: s_and_b32 s11, s13, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s28, 0xff -; SI-NEXT: s_lshl_b32 s11, s29, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_add_i32 s30, s30, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s30, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s31, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s95, 0xff +; SI-NEXT: s_lshl_b32 s6, s94, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s90, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s92, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s88, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s89, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 8 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s74, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s75, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s77, 0xff +; SI-NEXT: s_lshl_b32 s9, s76, 8 +; SI-NEXT: s_add_i32 s72, s72, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s10, s72, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s73, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_add_i32 s63, s63, 3 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s26, 0xff -; SI-NEXT: s_lshl_b32 s13, s27, 8 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s63, 0xff +; SI-NEXT: s_lshl_b32 s10, s62, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s11, s60, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s61, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s24, 0xff +; SI-NEXT: s_lshl_b32 s11, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s26, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s27, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s28, 0xff +; SI-NEXT: s_lshl_b32 s12, s29, 8 ; SI-NEXT: s_add_i32 s59, s59, 3 -; SI-NEXT: s_add_i32 s57, s57, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_or_b32 s11, s13, s11 -; SI-NEXT: s_and_b32 s13, s24, 0xff -; SI-NEXT: s_lshl_b32 s14, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s58, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s61, 0xff -; SI-NEXT: s_lshl_b32 s60, s60, 8 -; SI-NEXT: s_and_b32 s59, s59, 0xff -; SI-NEXT: s_lshl_b32 s58, s58, 8 -; SI-NEXT: s_and_b32 s57, s57, 0xff -; SI-NEXT: s_lshl_b32 s56, s56, 8 -; SI-NEXT: s_and_b32 s47, s47, 0xff -; SI-NEXT: s_lshl_b32 s46, s46, 8 -; SI-NEXT: s_and_b32 s45, s45, 0xff -; SI-NEXT: s_lshl_b32 s44, s44, 8 -; SI-NEXT: s_and_b32 s43, s43, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 8 -; SI-NEXT: s_and_b32 s41, s41, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s22, 0xff -; SI-NEXT: s_lshl_b32 s22, s23, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s21, s21, 8 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_or_b32 s58, s58, s59 -; SI-NEXT: s_or_b32 s56, s56, s57 -; SI-NEXT: s_or_b32 s46, s46, s47 -; SI-NEXT: s_or_b32 s44, s44, s45 -; SI-NEXT: s_or_b32 s42, s42, s43 -; SI-NEXT: s_or_b32 s15, s15, s41 -; SI-NEXT: s_or_b32 s14, s22, s14 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_addk_i32 s46, 0x300 -; SI-NEXT: s_addk_i32 s44, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s18, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s13, s19, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s20, 0xff +; SI-NEXT: s_lshl_b32 s14, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s15, s22, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s41, s13, 16 +; SI-NEXT: s_lshr_b32 s43, s11, 16 +; SI-NEXT: s_lshr_b32 s45, s9, 16 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 ; SI-NEXT: .LBB63_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s41, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s43, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s45, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s47, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_readlane_b32 s39, v27, 7 +; SI-NEXT: v_readlane_b32 s38, v27, 6 +; SI-NEXT: v_readlane_b32 s37, v27, 5 +; SI-NEXT: v_readlane_b32 s36, v27, 4 +; SI-NEXT: v_readlane_b32 s35, v27, 3 +; SI-NEXT: v_readlane_b32 s34, v27, 2 +; SI-NEXT: v_readlane_b32 s31, v27, 1 +; SI-NEXT: v_readlane_b32 s30, v27, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v40i8_to_v20f16_scalar: @@ -27135,56 +26850,26 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -27197,36 +26882,36 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -27237,13 +26922,23 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -27256,10 +26951,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -27267,10 +26962,10 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27278,11 +26973,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27290,11 +26985,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27302,11 +26997,11 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -27435,124 +27130,106 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-LABEL: bitcast_v20f16_to_v5f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB65_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB65_4 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -27560,11 +27237,11 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -27572,12 +27249,12 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -27590,11 +27267,41 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: .LBB65_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB65_5 +; SI-NEXT: .LBB65_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB65_2 +; SI-NEXT: .LBB65_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB65_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5f64_scalar: ; VI: ; %bb.0: @@ -27777,65 +27484,30 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 @@ -27843,80 +27515,50 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: @@ -27995,139 +27637,101 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: s_cbranch_scc0 .LBB67_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB67_4 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: .LBB67_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: s_branch .LBB67_5 +; SI-NEXT: .LBB67_3: +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB67_2 +; SI-NEXT: .LBB67_4: +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB67_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v5f64_to_v20f16_scalar: ; VI: ; %bb.0: @@ -28252,56 +27856,26 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v20f16_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28314,36 +27888,36 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 @@ -28354,13 +27928,23 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: .LBB68_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28373,10 +27957,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28384,10 +27968,10 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28395,11 +27979,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28407,11 +27991,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -28419,11 +28003,11 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -28552,124 +28136,106 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v5i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s24, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s20, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s26, 0 -; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB69_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB69_4 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -28677,11 +28243,11 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -28689,12 +28255,12 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -28707,11 +28273,41 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB69_5 +; SI-NEXT: .LBB69_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB69_2 +; SI-NEXT: .LBB69_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB69_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5i64_scalar: ; VI: ; %bb.0: @@ -28894,70 +28490,30 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 @@ -28972,78 +28528,48 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v11, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v12, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: @@ -29142,141 +28668,91 @@ define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: .LBB71_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 -; SI-NEXT: v_or_b32_e32 v4, v15, v4 -; SI-NEXT: v_or_b32_e32 v6, v13, v6 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s40, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s11, s8 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s12, s6 +; SI-NEXT: s_and_b32 s12, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s27, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s13, s4 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s26, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v5i64_to_v20f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 040f0c8b4d299..8026714f25992 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -667,28 +667,20 @@ define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB8_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2f16: @@ -754,24 +746,19 @@ define inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_i32_to_v2f16_scalar: @@ -844,38 +831,31 @@ define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_4 -; SI-NEXT: .LBB10_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB10_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32: @@ -942,31 +922,30 @@ define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v2f16_to_i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32_scalar: ; VI: ; %bb.0: @@ -2957,28 +2936,20 @@ define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2f16: @@ -3042,27 +3013,25 @@ define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_f32_to_v2f16_scalar: ; VI: ; %bb.0: @@ -3135,38 +3104,31 @@ define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB30_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB30_4 -; SI-NEXT: .LBB30_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB30_2 -; SI-NEXT: .LBB30_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB30_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32: @@ -3233,31 +3195,30 @@ define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg % ; SI-LABEL: bitcast_v2f16_to_f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB31_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32_scalar: ; VI: ; %bb.0: @@ -4901,31 +4862,30 @@ define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v2i16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2f16: @@ -4995,23 +4955,25 @@ define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: s_add_i32 s5, s6, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s7, s4, s6 +; SI-NEXT: s_and_b32 s6, s5, 0xffff ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v2i16_to_v2f16_scalar: @@ -5088,11 +5050,7 @@ define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5177,31 +5135,31 @@ define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v2i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: .LBB47_5: ; %end ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v2f16_to_v2i16_scalar: ; VI: ; %bb.0: @@ -6911,39 +6869,35 @@ define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v2bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2bf16: @@ -7010,36 +6964,36 @@ define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i ; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: s_cbranch_scc0 .LBB61_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_cbranch_execnz .LBB61_4 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_branch .LBB61_5 +; SI-NEXT: .LBB61_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB61_2 +; SI-NEXT: .LBB61_4: +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: .LBB61_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; VI: ; %bb.0: @@ -7117,41 +7071,34 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v2bf16_to_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v1, v0, v3, 16 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2f16: @@ -7309,33 +7256,27 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s5, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s4 ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB63_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v2bf16_to_v2f16_scalar: @@ -7511,38 +7452,31 @@ define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB64_4 -; SI-NEXT: .LBB64_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB64_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB64_2 -; SI-NEXT: .LBB64_4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32: @@ -7609,31 +7543,30 @@ define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v2f16_to_v1i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: s_cbranch_scc0 .LBB65_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB65_4 ; SI-NEXT: .LBB65_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: .LBB65_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB65_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: .LBB65_3: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB65_2 +; SI-NEXT: .LBB65_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32_scalar: ; VI: ; %bb.0: @@ -7713,28 +7646,20 @@ define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2f16: @@ -7800,24 +7725,19 @@ define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s17, 0 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: .LBB67_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB67_2 ; ; VI-LABEL: bitcast_v1i32_to_v2f16_scalar: @@ -7889,15 +7809,12 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v2f16_to_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB68_3 @@ -7908,8 +7825,9 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB68_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -8071,22 +7989,19 @@ define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v2f16_to_v4i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: s_cbranch_scc0 .LBB69_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_lshr_b32 s8, s7, 8 +; SI-NEXT: s_bfe_u32 s9, s6, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB69_4 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 @@ -8095,13 +8010,18 @@ define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inr ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 -; SI-NEXT: .LBB69_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB69_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: s_branch .LBB69_2 +; SI-NEXT: .LBB69_4: +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v4i8_scalar: ; VI: ; %bb.0: @@ -8215,43 +8135,45 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2f16: @@ -8456,34 +8378,38 @@ define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inr ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff ; SI-NEXT: .LBB71_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v4i8_to_v2f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index e81978684b8b6..70ed2ca42b706 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1503,170 +1503,93 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB8_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB8_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: @@ -1765,154 +1688,99 @@ define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 ; SI-NEXT: .LBB9_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s43, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s13, s6 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s14, s4 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v11i32_to_v22f16_scalar: @@ -2038,61 +1906,28 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2105,28 +1940,50 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v22, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -2138,24 +1995,13 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2163,10 +2009,10 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2174,11 +2020,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2186,11 +2032,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2198,11 +2044,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2210,11 +2056,11 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2222,12 +2068,12 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -2360,92 +2206,59 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v11i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_lshr_b32 s5, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: s_lshr_b32 s5, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: s_lshr_b32 s5, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v21, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_cbranch_execnz .LBB11_4 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2453,10 +2266,10 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s47 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2464,11 +2277,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s46 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -2476,11 +2289,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -2488,11 +2301,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -2500,11 +2313,11 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -2512,29 +2325,41 @@ define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: .LBB11_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11i32_scalar: ; VI: ; %bb.0: @@ -3690,170 +3515,93 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: @@ -3944,157 +3692,115 @@ define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: s_cbranch_scc0 .LBB17_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 -; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: s_lshr_b32 s27, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB17_4 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e64 v19, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s26, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v5, v15, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: s_branch .LBB17_5 +; SI-NEXT: .LBB17_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: .LBB17_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v20, s42 +; SI-NEXT: v_mov_b32_e32 v19, s43 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v21, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v11f32_to_v22f16_scalar: ; VI: ; %bb.0: @@ -4237,61 +3943,28 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v11f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v15, v6 +; SI-NEXT: v_mov_b32_e32 v16, v5 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4304,28 +3977,50 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v24, v4 -; SI-NEXT: v_or_b32_e32 v5, v22, v5 -; SI-NEXT: v_or_b32_e32 v6, v20, v6 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -4337,24 +4032,13 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4362,10 +4046,10 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4373,11 +4057,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4385,11 +4069,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4397,11 +4081,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4409,11 +4093,11 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4421,12 +4105,12 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4559,92 +4243,59 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-LABEL: bitcast_v22f16_to_v11f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_lshr_b32 s5, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: s_lshr_b32 s5, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: s_lshr_b32 s5, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s5 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s15, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s41, s24, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s22, 16 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s20, 16 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: s_lshr_b32 s47, s18, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s57, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: v_or_b32_e32 v2, v27, v2 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v5, v21, v5 -; SI-NEXT: v_or_b32_e32 v6, v19, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s56, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s47, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s43, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s42, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s41, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s15, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4652,10 +4303,10 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s47 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4663,11 +4314,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s46 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4675,11 +4326,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4687,11 +4338,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4699,11 +4350,11 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4711,29 +4362,41 @@ define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11f32_scalar: ; VI: ; %bb.0: @@ -4910,66 +4573,76 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v22i16_to_v22f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v24, v1, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v19, v1, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v23, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v16, v1, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v20, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v11, v1, v51 +; SI-NEXT: v_or_b32_e32 v17, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v26, v24, v33, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v36, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v38, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v48, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v50, 16 +; SI-NEXT: v_or_b32_e32 v32, v0, v34 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -4981,111 +4654,112 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v37 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v26, v24, v29, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v23, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v20, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v32 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v22f16: @@ -5204,157 +4878,189 @@ define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i ; SI-LABEL: bitcast_v22i16_to_v22f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s28, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s24, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s73, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s20, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s17, 16 +; SI-NEXT: s_lshr_b32 s62, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s62, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s58, 16 +; SI-NEXT: s_or_b32 s12, s4, s14 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s28, s63, 16 +; SI-NEXT: s_or_b32 s29, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s59, 16 +; SI-NEXT: s_or_b32 s10, s4, s28 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s60, 16 +; SI-NEXT: s_or_b32 s8, s4, s40 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s42, s73, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s7, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_or_b32 s27, s5, s7 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; SI-NEXT: s_mov_b32 s11, s29 +; SI-NEXT: s_lshr_b64 s[28:29], s[28:29], 16 +; SI-NEXT: s_mov_b32 s9, s41 +; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB21_3 ; SI-NEXT: .LBB21_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s61, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s60, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s72, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s59, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s63, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s58, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s62, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s27, s14, 0x30000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s57, s13, 16 +; SI-NEXT: s_lshr_b32 s58, s11, 16 +; SI-NEXT: s_lshr_b32 s59, s9, 16 +; SI-NEXT: s_lshr_b32 s60, s7, 16 +; SI-NEXT: s_lshr_b32 s61, s5, 16 +; SI-NEXT: s_lshr_b32 s56, s27, 16 ; SI-NEXT: .LBB21_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s57, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s14, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: s_or_b32 s11, s11, s14 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s14, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s14 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s14, s59, 16 +; SI-NEXT: s_or_b32 s9, s9, s14 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s14, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s14, s44, 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s14, s61, 16 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: s_and_b32 s14, s27, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB21_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v22i16_to_v22f16_scalar: @@ -5536,62 +5242,18 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v22f16_to_v22i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -5599,131 +5261,131 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_or_b32_e32 v9, v9, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v17 ; SI-NEXT: v_or_b32_e32 v3, v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 ; SI-NEXT: v_alignbit_b32 v21, v1, v21, 16 ; SI-NEXT: v_alignbit_b32 v20, v3, v20, 16 ; SI-NEXT: v_alignbit_b32 v19, v5, v19, 16 ; SI-NEXT: v_alignbit_b32 v18, v7, v18, 16 -; SI-NEXT: v_alignbit_b32 v16, v9, v16, 16 +; SI-NEXT: v_alignbit_b32 v15, v9, v15, 16 ; SI-NEXT: .LBB22_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v4, v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v17 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5844,196 +5506,176 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s9, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s27, 0 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: s_cbranch_execnz .LBB23_4 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_or_b32_e32 v23, v10, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v7, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s18 +; SI-NEXT: v_or_b32_e32 v25, v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 -; SI-NEXT: v_or_b32_e32 v29, v10, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 +; SI-NEXT: v_or_b32_e32 v30, v10, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v31, v11, v2 -; SI-NEXT: v_or_b32_e32 v30, v10, v4 +; SI-NEXT: v_or_b32_e32 v31, v10, v2 +; SI-NEXT: v_or_b32_e32 v29, v11, v4 ; SI-NEXT: v_or_b32_e32 v28, v12, v6 -; SI-NEXT: v_or_b32_e32 v26, v13, v8 +; SI-NEXT: v_or_b32_e32 v27, v13, v8 ; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[8:9], 16 -; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_branch .LBB23_5 +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v24, s9 +; SI-NEXT: v_mov_b32_e32 v21, s10 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v25, s26 +; SI-NEXT: v_mov_b32_e32 v27, s24 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v29, s20 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v16, s15 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: .LBB23_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 50dfbb9a5d234..60c5431f7e4c6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -2664,184 +2664,100 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: @@ -2943,167 +2859,107 @@ define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v12i32_to_v24f16_scalar: @@ -3234,66 +3090,30 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3306,30 +3126,54 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v31, v2 -; SI-NEXT: v_or_b32_e32 v3, v29, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 -; SI-NEXT: v_or_b32_e32 v6, v23, v6 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -3342,25 +3186,13 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -3373,10 +3205,10 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -3385,25 +3217,25 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3411,11 +3243,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3423,11 +3255,11 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3435,12 +3267,12 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3578,99 +3410,63 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v12i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3679,41 +3475,41 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s56 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -3721,11 +3517,11 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s44 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3733,11 +3529,11 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -3745,29 +3541,42 @@ define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12i32_scalar: ; VI: ; %bb.0: @@ -6098,184 +5907,100 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: @@ -6360,178 +6085,132 @@ cmp.false: end: %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x half> %phi -} - -define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB33_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 -; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v23, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s27, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16_scalar: ; VI: ; %bb.0: @@ -6677,66 +6356,30 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v12f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 +; SI-NEXT: v_mov_b32_e32 v24, v11 +; SI-NEXT: v_mov_b32_e32 v13, v10 +; SI-NEXT: v_mov_b32_e32 v14, v9 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6749,30 +6392,54 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v31, v2 -; SI-NEXT: v_or_b32_e32 v3, v29, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 -; SI-NEXT: v_or_b32_e32 v6, v23, v6 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -6785,25 +6452,13 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6816,10 +6471,10 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -6828,25 +6483,25 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -6854,11 +6509,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -6866,11 +6521,11 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -6878,12 +6533,12 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7021,99 +6676,63 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v12f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s26, 16 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s24, 16 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: s_lshr_b32 s45, s22, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_or_b32_e32 v4, v26, v4 -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v6, v22, v6 -; SI-NEXT: v_or_b32_e32 v7, v20, v7 -; SI-NEXT: v_or_b32_e32 v8, v18, v8 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 -; SI-NEXT: v_or_b32_e32 v10, v14, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s58, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s47, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s45, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s28 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7122,41 +6741,41 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s56 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -7164,11 +6783,11 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s44 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s43 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7176,11 +6795,11 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -7188,29 +6807,42 @@ define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12f32_scalar: ; VI: ; %bb.0: @@ -8955,76 +8587,34 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -9033,94 +8623,58 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: @@ -9181,185 +8735,139 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <6 x double> %a1 to <24 x half> - br label %end - -cmp.false: - %a3 = bitcast <6 x double> %a to <24 x half> - br label %end - -end: - %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x half> %phi -} - -define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB45_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: s_cbranch_execnz .LBB45_3 -; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: s_branch .LBB45_5 +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v19, s45 +; SI-NEXT: v_mov_b32_e32 v20, s44 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v23, s41 +; SI-NEXT: v_mov_b32_e32 v24, s40 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v15, s10 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: .LBB45_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16_scalar: ; VI: ; %bb.0: @@ -9487,66 +8995,30 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9559,42 +9031,42 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -9607,13 +9079,25 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9626,10 +9110,10 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9638,25 +9122,25 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9664,11 +9148,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9676,11 +9160,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9688,11 +9172,11 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -9831,142 +9315,118 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-LABEL: bitcast_v24f16_to_v6f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9974,11 +9434,11 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -9986,11 +9446,11 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -9998,12 +9458,12 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10016,11 +9476,41 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6f64_scalar: ; VI: ; %bb.0: @@ -11261,82 +10751,34 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -11353,92 +10795,56 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v13, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: @@ -11543,167 +10949,107 @@ define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s25, s27, 0 -; SI-NEXT: s_lshr_b32 s26, s24, 16 -; SI-NEXT: s_lshr_b32 s27, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v0, v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_or_b32_e32 v5, v5, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v9, v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_or_b32_e32 v4, v19, v4 -; SI-NEXT: v_or_b32_e32 v6, v17, v6 -; SI-NEXT: v_or_b32_e32 v8, v15, v8 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s45, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s43, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s13, s8 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s14, s42, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s14, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s41, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v6i64_to_v24f16_scalar: @@ -11834,66 +11180,30 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v6i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v16, v11 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v19, v8 +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11906,42 +11216,42 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -11954,13 +11264,25 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -11973,10 +11295,10 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -11985,25 +11307,25 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -12011,11 +11333,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -12023,11 +11345,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -12035,11 +11357,11 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -12178,142 +11500,118 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-LABEL: bitcast_v24f16_to_v6i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s6, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: s_lshr_b32 s5, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s6 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: s_lshr_b32 s15, s18, 16 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s16, 16 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v30, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_or_b32_e32 v8, v22, v8 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s37, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -12321,11 +11619,11 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -12333,11 +11631,11 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -12345,12 +11643,12 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -12363,11 +11661,41 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6i64_scalar: ; VI: ; %bb.0: @@ -12563,71 +11891,83 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v24i16_to_v24f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v27, v1, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v25, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v35, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v1, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v30, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v19, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v26, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v16, v1, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v24, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v12, v1, v55 +; SI-NEXT: v_or_b32_e32 v21, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v28, v27, v36, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v37, 16 +; SI-NEXT: v_alignbit_b32 v31, v22, v39, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v50, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v52, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v54, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -12640,120 +11980,121 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v50 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v8, v52, v8 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v55, v10 +; SI-NEXT: v_or_b32_e32 v8, v53, v8 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v28, v27, v35, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v30, 16 +; SI-NEXT: v_alignbit_b32 v31, v22, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v21, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v24f16: @@ -12877,170 +12218,206 @@ define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i ; SI-LABEL: bitcast_v24i16_to_v24f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s29, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_lshr_b32 s79, s26, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s22, 16 +; SI-NEXT: s_lshr_b32 s62, s21, 16 +; SI-NEXT: s_lshr_b32 s76, s20, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s60, s17, 16 +; SI-NEXT: s_lshr_b32 s74, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s60, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s62, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s12, s74, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s63, 16 +; SI-NEXT: s_or_b32 s10, s4, s12 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s14, s75, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s72, 16 +; SI-NEXT: s_or_b32 s8, s4, s14 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s42 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s77, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_lshl_b32 s56, s79, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_or_b32 s14, s12, s46 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s9, s15 +; SI-NEXT: s_mov_b32 s7, s43 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_mov_b32 s15, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_or_b32 s12, s12, s56 +; SI-NEXT: s_mov_b32 s13, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s6, s63, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s21, 0xffff +; SI-NEXT: s_lshl_b32 s8, s62, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xffff +; SI-NEXT: s_lshl_b32 s9, s75, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s10, s61, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s11, s16, s11 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s60, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: s_lshr_b32 s62, s7, 16 +; SI-NEXT: s_lshr_b32 s63, s5, 16 +; SI-NEXT: s_lshr_b32 s72, s15, 16 +; SI-NEXT: s_lshr_b32 s73, s13, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s61, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s63, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s73, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v24i16_to_v24f16_scalar: @@ -13230,211 +12607,163 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v24f16_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v5, v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 ; SI-NEXT: v_or_b32_e32 v3, v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 ; SI-NEXT: v_alignbit_b32 v22, v3, v22, 16 ; SI-NEXT: v_alignbit_b32 v21, v5, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v7, v20, 16 -; SI-NEXT: v_alignbit_b32 v19, v9, v19, 16 -; SI-NEXT: v_alignbit_b32 v17, v11, v17, 16 +; SI-NEXT: v_alignbit_b32 v19, v7, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v9, v18, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v15, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13560,213 +12889,191 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: s_lshr_b32 s5, s23, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 ; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: s_lshr_b32 s6, s20, 16 -; SI-NEXT: s_lshr_b32 s7, s18, 16 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: s_lshr_b32 s29, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s28, 0 -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v7, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_or_b32_e32 v3, v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 -; SI-NEXT: v_or_b32_e32 v34, v12, v0 -; SI-NEXT: v_or_b32_e32 v32, v13, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 +; SI-NEXT: v_or_b32_e32 v34, v13, v0 +; SI-NEXT: v_or_b32_e32 v32, v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v35, v13, v4 -; SI-NEXT: v_or_b32_e32 v33, v12, v6 -; SI-NEXT: v_or_b32_e32 v30, v14, v8 -; SI-NEXT: v_or_b32_e32 v28, v15, v10 +; SI-NEXT: v_or_b32_e32 v35, v12, v4 +; SI-NEXT: v_or_b32_e32 v33, v13, v6 +; SI-NEXT: v_or_b32_e32 v31, v14, v8 +; SI-NEXT: v_or_b32_e32 v30, v15, v10 ; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[6:7], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[10:11], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v28, s10 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v30, s26 +; SI-NEXT: v_mov_b32_e32 v31, s24 +; SI-NEXT: v_mov_b32_e32 v33, s22 +; SI-NEXT: v_mov_b32_e32 v35, s20 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v18, s15 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 95359d8ae8f72..8e5490d7eeafc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2978,212 +2978,114 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28f16: @@ -3292,196 +3194,126 @@ define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: s_lshr_b32 s5, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: s_branch .LBB17_2 -; -; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -3622,76 +3454,34 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3704,48 +3494,48 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -3760,13 +3550,27 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3774,10 +3578,10 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3785,11 +3589,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -3797,11 +3601,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -3809,11 +3613,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -3821,11 +3625,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -3833,11 +3637,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -3845,11 +3649,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -3857,11 +3661,11 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -4010,114 +3814,84 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v14i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4125,10 +3899,10 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4136,11 +3910,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -4148,11 +3922,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -4160,11 +3934,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4172,11 +3946,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -4184,11 +3958,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4196,11 +3970,11 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -4208,12 +3982,12 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -4226,11 +4000,41 @@ define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_5 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB19_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14i32_scalar: ; VI: ; %bb.0: @@ -6825,212 +6629,114 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16: @@ -7130,196 +6836,142 @@ define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 ; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s29, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v22, s58 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16_scalar: ; VI: ; %bb.0: @@ -7473,76 +7125,34 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v14f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7555,48 +7165,48 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -7611,13 +7221,27 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7625,10 +7249,10 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7636,11 +7260,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7648,11 +7272,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -7660,11 +7284,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -7672,11 +7296,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -7684,11 +7308,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -7696,11 +7320,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -7708,11 +7332,11 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -7861,114 +7485,84 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v14f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB35_4 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7976,10 +7570,10 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7987,11 +7581,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -7999,11 +7593,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -8011,11 +7605,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -8023,11 +7617,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -8035,11 +7629,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -8047,11 +7641,11 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -8059,12 +7653,12 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -8077,11 +7671,41 @@ define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB35_5 +; SI-NEXT: .LBB35_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB35_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14f32_scalar: ; VI: ; %bb.0: @@ -10033,94 +9657,38 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -10139,106 +9707,64 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v18, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28f16: @@ -10351,193 +9877,123 @@ define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: s_add_u32 s8, s18, 3 -; SI-NEXT: s_addc_u32 s9, s19, 0 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_lshr_b32 s11, s9, 16 -; SI-NEXT: s_add_u32 s12, s20, 3 -; SI-NEXT: s_addc_u32 s13, s21, 0 -; SI-NEXT: s_lshr_b32 s14, s12, 16 -; SI-NEXT: s_lshr_b32 s15, s13, 16 -; SI-NEXT: s_add_u32 s16, s22, 3 -; SI-NEXT: s_addc_u32 s17, s23, 0 -; SI-NEXT: s_lshr_b32 s18, s16, 16 -; SI-NEXT: s_lshr_b32 s19, s17, 16 -; SI-NEXT: s_add_u32 s20, s24, 3 -; SI-NEXT: s_addc_u32 s21, s25, 0 -; SI-NEXT: s_lshr_b32 s22, s20, 16 -; SI-NEXT: s_lshr_b32 s23, s21, 16 -; SI-NEXT: s_add_u32 s24, s26, 3 -; SI-NEXT: s_addc_u32 s25, s27, 0 -; SI-NEXT: s_lshr_b32 s26, s24, 16 -; SI-NEXT: s_lshr_b32 s27, s25, 16 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_lshr_b32 s40, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v4, v23, v4 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s40, 16 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshl_b32 s9, s58, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s57, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s13, s20, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s10, s14, s10 +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s47, 16 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s15, s8 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s45, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s17, s4 +; SI-NEXT: s_and_b32 s17, s29, 0xffff +; SI-NEXT: s_lshl_b32 s18, s44, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v7i64_to_v28f16_scalar: @@ -10681,76 +10137,34 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10763,48 +10177,48 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -10819,13 +10233,27 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10833,10 +10261,10 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10844,11 +10272,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -10856,11 +10284,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -10868,11 +10296,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -10880,11 +10308,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -10892,11 +10320,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -10904,11 +10332,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -10916,11 +10344,11 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -11069,114 +10497,84 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-LABEL: bitcast_v28f16_to_v7i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB47_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB47_4 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -11184,10 +10582,10 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11195,11 +10593,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -11207,11 +10605,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -11219,11 +10617,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -11231,11 +10629,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -11243,11 +10641,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -11255,11 +10653,11 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -11267,12 +10665,12 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -11285,11 +10683,41 @@ define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB47_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_5 +; SI-NEXT: .LBB47_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7i64_scalar: ; VI: ; %bb.0: @@ -12635,87 +12063,38 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -12725,108 +12104,66 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_alignbit_b32 v14, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v15, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v16, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v17, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v31 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16: @@ -12912,189 +12249,135 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: s_cbranch_scc0 .LBB53_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_4 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[10:11], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 -; SI-NEXT: v_or_b32_e32 v0, v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 -; SI-NEXT: v_or_b32_e32 v4, v22, v4 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_or_b32_e32 v9, v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v10, v17, v10 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[15:16], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[16:17], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; SI-NEXT: s_branch .LBB53_5 +; SI-NEXT: .LBB53_3: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB53_2 +; SI-NEXT: .LBB53_4: +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v22, s58 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v25, s47 +; SI-NEXT: v_mov_b32_e32 v26, s46 +; SI-NEXT: v_mov_b32_e32 v27, s45 +; SI-NEXT: v_mov_b32_e32 v28, s44 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v19, s14 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v17, s10 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: .LBB53_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16_scalar: ; VI: ; %bb.0: @@ -13227,76 +12510,34 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v28f16_to_v7f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v16, v13 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v18, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v9 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v7 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -13309,48 +12550,48 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB54_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -13365,13 +12606,27 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -13379,10 +12634,10 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13390,11 +12645,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13402,11 +12657,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13414,11 +12669,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -13426,11 +12681,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -13438,11 +12693,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13450,11 +12705,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -13462,11 +12717,11 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -13615,114 +12870,84 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-LABEL: bitcast_v28f16_to_v7f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: s_lshr_b32 s8, s17, 16 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s8, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s36, 0 +; SI-NEXT: v_writelane_b32 v16, s37, 1 +; SI-NEXT: v_writelane_b32 v16, s38, 2 +; SI-NEXT: v_writelane_b32 v16, s39, 3 +; SI-NEXT: v_writelane_b32 v16, s48, 4 +; SI-NEXT: v_writelane_b32 v16, s49, 5 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s56, s19, 16 +; SI-NEXT: s_lshr_b32 s57, s18, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_writelane_b32 v16, s50, 6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: v_writelane_b32 v16, s51, 7 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v30, v6 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: v_or_b32_e32 v9, v23, v9 -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s36, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_or_b32 s38, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_and_b32 s40, s17, 0xffff +; SI-NEXT: s_lshl_b32 s41, s58, 16 +; SI-NEXT: s_or_b32 s39, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s37, s40, s41 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s7, 16 +; SI-NEXT: s_or_b32 s48, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s49, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -13730,10 +12955,10 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13741,11 +12966,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 @@ -13753,11 +12978,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -13765,11 +12990,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -13777,11 +13002,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 @@ -13789,11 +13014,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13801,11 +13026,11 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -13813,12 +13038,12 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -13831,11 +13056,41 @@ define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v2, s38 +; SI-NEXT: v_mov_b32_e32 v3, s39 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s42 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_mov_b32_e32 v10, s46 +; SI-NEXT: v_mov_b32_e32 v11, s47 +; SI-NEXT: v_mov_b32_e32 v12, s48 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v14, s50 +; SI-NEXT: v_mov_b32_e32 v15, s51 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_readlane_b32 s51, v16, 7 +; SI-NEXT: v_readlane_b32 s50, v16, 6 +; SI-NEXT: v_readlane_b32 s49, v16, 5 +; SI-NEXT: v_readlane_b32 s48, v16, 4 +; SI-NEXT: v_readlane_b32 s39, v16, 3 +; SI-NEXT: v_readlane_b32 s38, v16, 2 +; SI-NEXT: v_readlane_b32 s37, v16, 1 +; SI-NEXT: v_readlane_b32 s36, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7f64_scalar: ; VI: ; %bb.0: @@ -14047,6 +13302,20 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v28i16_to_v28f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -14055,82 +13324,85 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v31, v1, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v1, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v49, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v27, v1, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v37, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v25, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v32, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v22, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v30, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v19, v1, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v28, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v14, v1, v47 +; SI-NEXT: v_or_b32_e32 v23, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v33, v31, v50, 16 +; SI-NEXT: v_alignbit_b32 v34, v29, v51, 16 +; SI-NEXT: v_alignbit_b32 v35, v27, v53, 16 +; SI-NEXT: v_alignbit_b32 v36, v25, v55, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v41, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v43, 16 +; SI-NEXT: v_alignbit_b32 v48, v14, v46, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v46 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -14145,80 +13417,95 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v42 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v50 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v10, v43, v10 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 +; SI-NEXT: v_or_b32_e32 v6, v55, v6 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v6, v42, v6 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v33, v31, v49, 16 +; SI-NEXT: v_alignbit_b32 v34, v29, v37, 16 +; SI-NEXT: v_alignbit_b32 v35, v27, v32, 16 +; SI-NEXT: v_alignbit_b32 v36, v25, v30, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v28, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v23, 16 +; SI-NEXT: v_alignbit_b32 v48, v14, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload @@ -14229,62 +13516,48 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v20 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14419,197 +13692,239 @@ define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i ; SI-LABEL: bitcast_v28i16_to_v28f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s95, s28, 16 +; SI-NEXT: s_lshr_b32 s79, s27, 16 +; SI-NEXT: s_lshr_b32 s94, s26, 16 +; SI-NEXT: s_lshr_b32 s78, s25, 16 +; SI-NEXT: s_lshr_b32 s93, s24, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s22, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b32 s91, s20, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s90, s18, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s89, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s74, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s76, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s89, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: s_or_b32 s12, s4, s14 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_or_b32 s10, s4, s40 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s91, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s8, s4, s46 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s56, s92, 16 +; SI-NEXT: s_lshl_b32 s60, s94, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s88, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s56 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s58, s93, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_lshl_b32 s62, s95, 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[40:41], 16 +; SI-NEXT: s_or_b32 s40, s14, s60 +; SI-NEXT: s_and_b32 s14, s28, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s58 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s11, s41 +; SI-NEXT: s_mov_b32 s9, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s7, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s5, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s41, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_or_b32 s14, s14, s62 +; SI-NEXT: s_mov_b32 s15, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s78, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s77, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s91, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s76, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s90, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s75, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s89, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s9, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s5, 16 +; SI-NEXT: s_lshr_b32 s79, s41, 16 +; SI-NEXT: s_lshr_b32 s88, s15, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s77, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s41, 0xffff +; SI-NEXT: s_lshl_b32 s18, s79, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s18, s88, 16 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v28i16_to_v28f16_scalar: @@ -14818,243 +14133,187 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_or_b32_e32 v11, v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_or_b32_e32 v9, v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v5, v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 ; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: v_or_b32_e32 v21, v21, v26 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v23 -; SI-NEXT: v_or_b32_e32 v17, v17, v22 -; SI-NEXT: v_or_b32_e32 v15, v15, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 ; SI-NEXT: v_alignbit_b32 v27, v1, v27, 16 ; SI-NEXT: v_alignbit_b32 v26, v3, v26, 16 ; SI-NEXT: v_alignbit_b32 v25, v5, v25, 16 ; SI-NEXT: v_alignbit_b32 v24, v7, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v9, v23, 16 -; SI-NEXT: v_alignbit_b32 v22, v11, v22, 16 -; SI-NEXT: v_alignbit_b32 v20, v13, v20, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v11, v20, 16 +; SI-NEXT: v_alignbit_b32 v18, v13, v18, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v21 -; SI-NEXT: v_or_b32_e32 v4, v4, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v18 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15190,178 +14449,124 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s12, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s13, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 -; SI-NEXT: s_lshr_b32 s6, s22, 16 -; SI-NEXT: s_lshr_b32 s7, s20, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s9, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s19 +; SI-NEXT: v_or_b32_e32 v5, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v16 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_or_b32_e32 v11, v11, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s18 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_or_b32_e32 v49, v14, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 -; SI-NEXT: v_or_b32_e32 v48, v15, v2 -; SI-NEXT: v_or_b32_e32 v38, v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 +; SI-NEXT: v_or_b32_e32 v39, v15, v2 +; SI-NEXT: v_or_b32_e32 v37, v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -15370,14 +14575,12 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v39, v14, v6 -; SI-NEXT: v_or_b32_e32 v36, v15, v8 -; SI-NEXT: v_or_b32_e32 v34, v16, v10 -; SI-NEXT: v_or_b32_e32 v32, v17, v12 +; SI-NEXT: v_or_b32_e32 v48, v14, v6 +; SI-NEXT: v_or_b32_e32 v38, v15, v8 +; SI-NEXT: v_or_b32_e32 v36, v16, v10 +; SI-NEXT: v_or_b32_e32 v35, v17, v12 ; SI-NEXT: v_lshr_b64 v[26:27], v[0:1], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[4:5], 16 @@ -15385,52 +14588,82 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_lshr_b64 v[18:19], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[10:11], 16 ; SI-NEXT: v_lshr_b64 v[14:15], v[12:13], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v28, s15 +; SI-NEXT: v_mov_b32_e32 v33, s10 +; SI-NEXT: v_mov_b32_e32 v29, s11 +; SI-NEXT: v_mov_b32_e32 v30, s9 +; SI-NEXT: v_mov_b32_e32 v31, s8 +; SI-NEXT: v_mov_b32_e32 v32, s7 +; SI-NEXT: v_mov_b32_e32 v34, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v35, s28 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v37, s20 +; SI-NEXT: v_mov_b32_e32 v39, s18 +; SI-NEXT: v_mov_b32_e32 v49, s16 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v26, s43 +; SI-NEXT: v_mov_b32_e32 v24, s42 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v16, s13 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v39 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v18 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index 1bcc09a680b2a..547985e7ef4e3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -11,50 +11,45 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: .LBB0_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB0_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 ; SI-NEXT: .LBB0_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -255,42 +250,38 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s5, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s6 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[0:1], 16 ; SI-NEXT: .LBB1_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_v3bf16_to_v3f16_scalar: @@ -513,49 +504,43 @@ define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: .LBB2_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -627,46 +612,45 @@ define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i ; SI-LABEL: bitcast_v3f16_to_v3bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB3_4 ; SI-NEXT: .LBB3_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: .LBB3_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_branch .LBB3_5 +; SI-NEXT: .LBB3_3: +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: .LBB3_5: ; %end ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; SI-NEXT: v_lshr_b64 v[0:1], v[1:2], 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_branch .LBB3_2 ; ; VI-LABEL: bitcast_v3f16_to_v3bf16_scalar: ; VI: ; %bb.0: @@ -1435,14 +1419,8 @@ define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v3f16_to_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -1535,37 +1513,36 @@ define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v3f16_to_v3i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_cbranch_scc0 .LBB9_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: s_cbranch_execnz .LBB9_4 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_branch .LBB9_5 +; SI-NEXT: .LBB9_3: +; SI-NEXT: s_branch .LBB9_2 +; SI-NEXT: .LBB9_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB9_5: ; %end ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v3f16_to_v3i16_scalar: ; VI: ; %bb.0: @@ -1649,38 +1626,38 @@ define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v3i16_to_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v4, v3, v5, 16 +; SI-NEXT: v_or_b32_e32 v2, v0, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB10_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: .LBB10_4: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v4, v0, v2, 16 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3f16: @@ -1749,32 +1726,37 @@ define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v3i16_to_v3f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s8, s6, 16 +; SI-NEXT: s_mov_b32 s9, s17 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s7, s17, 0xffff +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s17, 3 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_add_i32 s6, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: .LBB11_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_v3i16_to_v3f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 0625121f9ea7a..fd2fec386b6bf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -3270,240 +3270,128 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32f16: @@ -3629,77 +3517,43 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s21, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v4 -; SI-NEXT: v_readfirstlane_b32 s19, v5 -; SI-NEXT: v_readfirstlane_b32 s18, v6 -; SI-NEXT: v_readfirstlane_b32 s17, v7 -; SI-NEXT: v_readfirstlane_b32 s16, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: v_readfirstlane_b32 s7, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 @@ -3710,157 +3564,111 @@ define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: s_lshr_b32 s5, s20, 16 -; SI-NEXT: s_lshr_b32 s22, s19, 16 -; SI-NEXT: s_lshr_b32 s23, s18, 16 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s16, 16 -; SI-NEXT: s_lshr_b32 s26, s15, 16 -; SI-NEXT: s_lshr_b32 s27, s14, 16 -; SI-NEXT: s_lshr_b32 s28, s13, 16 -; SI-NEXT: s_lshr_b32 s29, s12, 16 -; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: s_lshr_b32 s41, s10, 16 -; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: s_lshr_b32 s43, s7, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v16i32_to_v32f16_scalar: @@ -4017,182 +3825,150 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: .LBB18_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -4200,10 +3976,10 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4211,11 +3987,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -4223,11 +3999,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -4235,11 +4011,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -4247,11 +4023,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4259,11 +4035,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -4271,11 +4047,11 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -4283,12 +4059,12 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -4449,248 +4225,216 @@ define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -18772,240 +18516,128 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: @@ -19109,236 +18741,142 @@ define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v55, s16 -; SI-NEXT: v_mov_b32_e32 v54, s17 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: v_mov_b32_e32 v52, s19 -; SI-NEXT: v_mov_b32_e32 v51, s20 -; SI-NEXT: v_mov_b32_e32 v50, s21 -; SI-NEXT: v_mov_b32_e32 v49, s22 -; SI-NEXT: v_mov_b32_e32 v48, s23 -; SI-NEXT: v_mov_b32_e32 v39, s24 -; SI-NEXT: v_mov_b32_e32 v38, s25 -; SI-NEXT: v_mov_b32_e32 v36, s26 -; SI-NEXT: v_mov_b32_e32 v35, s27 -; SI-NEXT: v_mov_b32_e32 v33, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v37, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v3, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v16f32_to_v32f16_scalar: @@ -19499,126 +19037,110 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -19635,30 +19157,14 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19671,10 +19177,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19682,10 +19188,10 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19693,11 +19199,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -19705,11 +19211,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -19717,11 +19223,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -19729,11 +19235,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19741,11 +19247,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -19753,11 +19259,11 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -19765,12 +19271,12 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -19931,248 +19437,216 @@ define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -33786,106 +33260,42 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB60_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB60_4 @@ -33906,120 +33316,72 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v27, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: @@ -34149,238 +33511,158 @@ define inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: v_readfirstlane_b32 s21, v4 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s16, v7 -; SI-NEXT: v_readfirstlane_b32 s17, v8 -; SI-NEXT: v_readfirstlane_b32 s14, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v12 -; SI-NEXT: v_readfirstlane_b32 s10, v13 -; SI-NEXT: v_readfirstlane_b32 s11, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_readfirstlane_b32 s19, v4 +; SI-NEXT: v_readfirstlane_b32 s16, v5 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[20:21], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s20, 3 -; SI-NEXT: s_addc_u32 s5, s21, 0 -; SI-NEXT: s_lshr_b32 s20, s4, 16 -; SI-NEXT: s_lshr_b32 s21, s5, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s22, s18, 16 -; SI-NEXT: s_lshr_b32 s23, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s24, s16, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s26, s14, 16 -; SI-NEXT: s_lshr_b32 s27, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s28, s12, 16 -; SI-NEXT: s_lshr_b32 s29, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s7, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s11, 16 +; SI-NEXT: s_lshr_b32 s60, s13, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_or_b32_e32 v0, v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v9, v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: v_or_b32_e32 v4, v27, v4 -; SI-NEXT: v_or_b32_e32 v6, v25, v6 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_or_b32_e32 v10, v21, v10 -; SI-NEXT: v_or_b32_e32 v12, v19, v12 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s44, 16 +; SI-NEXT: s_or_b32 s18, s18, s21 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s21, s63, 16 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s21, s42, 16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s21, s62, 16 +; SI-NEXT: s_or_b32 s17, s17, s21 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s21, s40, 16 +; SI-NEXT: s_or_b32 s14, s14, s21 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s21, s61, 16 +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s21, s28, 16 +; SI-NEXT: s_or_b32 s12, s12, s21 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s13, s13, s21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s21, s26, 16 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s21, s59, 16 +; SI-NEXT: s_or_b32 s11, s11, s21 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s21, s24, 16 +; SI-NEXT: s_or_b32 s8, s8, s21 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s21, s58, 16 +; SI-NEXT: s_or_b32 s9, s9, s21 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s21, s57, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s56, 16 +; SI-NEXT: s_or_b32 s7, s7, s21 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v8i64_to_v32f16_scalar: @@ -34537,126 +33819,110 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -34673,30 +33939,14 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -34709,10 +33959,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -34720,10 +33970,10 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -34731,11 +33981,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -34743,11 +33993,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -34755,11 +34005,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -34767,11 +34017,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -34779,11 +34029,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -34791,11 +34041,11 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -34803,12 +34053,12 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -34969,248 +34219,216 @@ define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -47986,98 +47204,42 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB76_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 @@ -48088,122 +47250,74 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_alignbit_b32 v16, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v18, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v25, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: @@ -48291,228 +47405,134 @@ define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s19 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s21 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s23 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s25 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v4, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshr_b64 v[18:19], v[0:1], 16 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v34 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v31 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v17, v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v1, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB77_2 ; ; VI-LABEL: bitcast_v8f64_to_v32f16_scalar: @@ -48649,126 +47669,110 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_mov_b32_e32 v19, v12 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v22, v9 +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v24, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v31 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 -; SI-NEXT: v_or_b32_e32 v3, v49, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v5, v37, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v33, v7 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v9, v29, v9 -; SI-NEXT: v_or_b32_e32 v10, v27, v10 -; SI-NEXT: v_or_b32_e32 v11, v25, v11 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -48785,30 +47789,14 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: .LBB78_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -48821,10 +47809,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -48832,10 +47820,10 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -48843,11 +47831,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -48855,11 +47843,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 @@ -48867,11 +47855,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -48879,11 +47867,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -48891,11 +47879,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -48903,11 +47891,11 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -48915,12 +47903,12 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -49081,248 +48069,216 @@ define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, ; SI-LABEL: bitcast_v32f16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v31, v7 -; SI-NEXT: v_or_b32_e32 v8, v30, v8 -; SI-NEXT: v_or_b32_e32 v9, v27, v9 -; SI-NEXT: v_or_b32_e32 v10, v26, v10 -; SI-NEXT: v_or_b32_e32 v11, v23, v11 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 @@ -60811,6 +59767,22 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -60827,92 +59799,95 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v26 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v36, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v34, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v55, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v1, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v51, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v29, v1, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v39, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v26, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v35, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v23, v1, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v33, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v21, v1, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v30, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v16, v1, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_alignbit_b32 v37, v36, v40, 16 +; SI-NEXT: v_alignbit_b32 v38, v34, v41, 16 +; SI-NEXT: v_alignbit_b32 v48, v32, v43, 16 +; SI-NEXT: v_alignbit_b32 v49, v29, v45, 16 +; SI-NEXT: v_alignbit_b32 v50, v26, v47, 16 +; SI-NEXT: v_alignbit_b32 v52, v23, v57, 16 +; SI-NEXT: v_alignbit_b32 v53, v21, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v61, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v61 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -60929,90 +59904,107 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v60 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v58 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v42 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v10, v60, v10 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v4, v46, v4 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v37, v36, v55, 16 +; SI-NEXT: v_alignbit_b32 v38, v34, v51, 16 +; SI-NEXT: v_alignbit_b32 v48, v32, v39, 16 +; SI-NEXT: v_alignbit_b32 v49, v29, v35, 16 +; SI-NEXT: v_alignbit_b32 v50, v26, v33, 16 +; SI-NEXT: v_alignbit_b32 v52, v23, v30, 16 +; SI-NEXT: v_alignbit_b32 v53, v21, v27, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v24, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -61031,70 +60023,54 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -61239,223 +60215,299 @@ define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i ; SI-LABEL: bitcast_v32i16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s30, 0 +; SI-NEXT: v_writelane_b32 v16, s31, 1 +; SI-NEXT: v_writelane_b32 v16, s34, 2 +; SI-NEXT: v_writelane_b32 v16, s35, 3 +; SI-NEXT: v_writelane_b32 v16, s36, 4 +; SI-NEXT: v_writelane_b32 v16, s37, 5 +; SI-NEXT: v_writelane_b32 v16, s38, 6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: s_lshr_b32 s92, s29, 16 +; SI-NEXT: s_lshr_b32 s36, s28, 16 +; SI-NEXT: s_lshr_b32 s91, s27, 16 +; SI-NEXT: s_lshr_b32 s35, s26, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s34, s24, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s22, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s30, s20, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s95, s18, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_writelane_b32 v16, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s37, v1 +; SI-NEXT: v_readfirstlane_b32 s38, v0 +; SI-NEXT: v_readfirstlane_b32 s93, v3 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v4 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s78, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s94, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_or_b32 s14, s4, s44 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s95, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s88, 16 +; SI-NEXT: s_or_b32 s12, s4, s46 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s56, s30, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s89, 16 +; SI-NEXT: s_or_b32 s10, s4, s56 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s58, s31, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_or_b32 s8, s4, s58 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s60, s34, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s91, 16 +; SI-NEXT: s_or_b32 s6, s4, s60 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_lshl_b32 s40, s36, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s28, 0xffff +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_lshl_b32 s74, s39, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s38, 0xffff +; SI-NEXT: s_mov_b32 s15, s45 +; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 16 +; SI-NEXT: s_mov_b32 s13, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s11, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s9, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s7, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s5, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_or_b32 s40, s40, s74 +; SI-NEXT: s_mov_b32 s41, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s37, s37, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s6, s91, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s25, 0xffff +; SI-NEXT: s_lshl_b32 s8, s90, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s10, s89, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s30, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s88, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s18, 0xffff +; SI-NEXT: s_lshl_b32 s13, s95, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s14, s79, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s16, 0xffff +; SI-NEXT: s_lshl_b32 s15, s94, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b32 s78, s15, 16 +; SI-NEXT: s_lshr_b32 s79, s13, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: s_lshr_b32 s90, s7, 16 +; SI-NEXT: s_lshr_b32 s91, s5, 16 +; SI-NEXT: s_lshr_b32 s92, s43, 16 +; SI-NEXT: s_lshr_b32 s93, s41, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v27 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s79, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s89, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s91, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s43, 0xffff +; SI-NEXT: s_lshl_b32 s18, s92, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s40, 0xffff +; SI-NEXT: s_lshl_b32 s19, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xffff +; SI-NEXT: s_lshl_b32 s20, s93, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_readlane_b32 s39, v16, 7 +; SI-NEXT: v_readlane_b32 s38, v16, 6 +; SI-NEXT: v_readlane_b32 s37, v16, 5 +; SI-NEXT: v_readlane_b32 s36, v16, 4 +; SI-NEXT: v_readlane_b32 s35, v16, 3 +; SI-NEXT: v_readlane_b32 s34, v16, 2 +; SI-NEXT: v_readlane_b32 s31, v16, 1 +; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: @@ -61710,276 +60762,212 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_or_b32_e32 v13, v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v9, v9, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v5, v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; SI-NEXT: v_or_b32_e32 v3, v3, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v24, v24, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_or_b32_e32 v18, v18, v26 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v17, v17, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 ; SI-NEXT: v_alignbit_b32 v31, v1, v31, 16 ; SI-NEXT: v_alignbit_b32 v30, v3, v30, 16 ; SI-NEXT: v_alignbit_b32 v29, v5, v29, 16 ; SI-NEXT: v_alignbit_b32 v28, v7, v28, 16 ; SI-NEXT: v_alignbit_b32 v27, v9, v27, 16 -; SI-NEXT: v_alignbit_b32 v26, v11, v26, 16 -; SI-NEXT: v_alignbit_b32 v25, v13, v25, 16 -; SI-NEXT: v_alignbit_b32 v23, v15, v23, 16 +; SI-NEXT: v_alignbit_b32 v25, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v22, v13, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v15, v20, 16 ; SI-NEXT: .LBB90_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_or_b32_e32 v7, v7, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v0, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_or_b32_e32 v6, v6, v21 -; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_or_b32_e32 v12, v12, v18 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62125,280 +61113,248 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s11, s26, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 -; SI-NEXT: s_lshr_b32 s6, s24, 16 -; SI-NEXT: s_lshr_b32 s7, s22, 16 -; SI-NEXT: s_lshr_b32 s8, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s18, 16 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v55 -; SI-NEXT: v_or_b32_e32 v52, v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v11, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_or_b32_e32 v7, v17, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 +; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v19, v17, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v20 -; SI-NEXT: v_or_b32_e32 v55, v17, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 ; SI-NEXT: v_or_b32_e32 v53, v16, v4 -; SI-NEXT: v_or_b32_e32 v49, v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_or_b32_e32 v50, v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_or_b32_e32 v51, v20, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v54, v17, v8 -; SI-NEXT: v_or_b32_e32 v50, v16, v10 -; SI-NEXT: v_or_b32_e32 v38, v18, v12 -; SI-NEXT: v_or_b32_e32 v36, v19, v14 -; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[16:17], v[14:15], 16 -; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v52, v16, v12 +; SI-NEXT: v_or_b32_e32 v49, v17, v10 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v48, v21, v8 +; SI-NEXT: v_or_b32_e32 v39, v20, v18 +; SI-NEXT: v_lshr_b64 v[30:31], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v0 +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v38, s6 +; SI-NEXT: v_mov_b32_e32 v14, s40 +; SI-NEXT: v_mov_b32_e32 v33, s14 +; SI-NEXT: v_mov_b32_e32 v34, s12 +; SI-NEXT: v_mov_b32_e32 v35, s9 +; SI-NEXT: v_mov_b32_e32 v36, s8 +; SI-NEXT: v_mov_b32_e32 v37, s7 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v39, s28 +; SI-NEXT: v_mov_b32_e32 v49, s26 +; SI-NEXT: v_mov_b32_e32 v52, s24 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s18 +; SI-NEXT: v_mov_b32_e32 v53, s16 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_mov_b32_e32 v16, s43 +; SI-NEXT: v_mov_b32_e32 v30, s42 +; SI-NEXT: v_mov_b32_e32 v28, s41 +; SI-NEXT: v_mov_b32_e32 v26, s15 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v22, s11 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v33 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_mov_b32_e32 v1, v16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; VI: ; %bb.0: @@ -75409,54 +74365,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -75473,131 +74381,109 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB100_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v63 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -75617,215 +74503,215 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v0 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v18 ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v27 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v22 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v29 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v24 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v26 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v28 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v35 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v30 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v39 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v34 ; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v49 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 ; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v53 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v48 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v55 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v50 ; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v52 ; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -75972,392 +74858,318 @@ define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg % ; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 ; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: s_lshl_b32 s45, s6, 16 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 16 +; SI-NEXT: s_lshl_b32 s56, s18, 16 +; SI-NEXT: s_lshl_b32 s57, s8, 16 +; SI-NEXT: s_lshl_b32 s58, s19, 16 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_lshl_b32 s60, s20, 16 +; SI-NEXT: s_lshl_b32 s61, s10, 16 +; SI-NEXT: s_lshl_b32 s62, s21, 16 +; SI-NEXT: s_lshl_b32 s63, s11, 16 +; SI-NEXT: s_lshl_b32 s72, s22, 16 +; SI-NEXT: s_lshl_b32 s73, s12, 16 +; SI-NEXT: s_lshl_b32 s74, s23, 16 +; SI-NEXT: s_lshl_b32 s75, s13, 16 +; SI-NEXT: s_lshl_b32 s76, s24, 16 +; SI-NEXT: s_lshl_b32 s77, s14, 16 +; SI-NEXT: s_lshl_b32 s78, s25, 16 +; SI-NEXT: s_lshl_b32 s79, s15, 16 +; SI-NEXT: s_lshl_b32 s88, s26, 16 +; SI-NEXT: s_lshl_b32 s89, s40, 16 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_lshl_b32 s91, s41, 16 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_lshl_b32 s93, s42, 16 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_lshl_b32 s95, s43, 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v63 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 -; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v27 -; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 -; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v31 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v39 -; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v1, s95 +; SI-NEXT: v_mov_b32_e32 v0, s94 +; SI-NEXT: v_mov_b32_e32 v7, s93 +; SI-NEXT: v_mov_b32_e32 v6, s92 +; SI-NEXT: v_mov_b32_e32 v9, s91 +; SI-NEXT: v_mov_b32_e32 v8, s90 +; SI-NEXT: v_mov_b32_e32 v11, s89 +; SI-NEXT: v_mov_b32_e32 v10, s88 +; SI-NEXT: v_mov_b32_e32 v13, s79 +; SI-NEXT: v_mov_b32_e32 v12, s78 +; SI-NEXT: v_mov_b32_e32 v15, s77 +; SI-NEXT: v_mov_b32_e32 v14, s76 +; SI-NEXT: v_mov_b32_e32 v17, s75 +; SI-NEXT: v_mov_b32_e32 v16, s74 +; SI-NEXT: v_mov_b32_e32 v19, s73 +; SI-NEXT: v_mov_b32_e32 v18, s72 +; SI-NEXT: v_mov_b32_e32 v21, s63 +; SI-NEXT: v_mov_b32_e32 v20, s62 +; SI-NEXT: v_mov_b32_e32 v23, s61 +; SI-NEXT: v_mov_b32_e32 v22, s60 +; SI-NEXT: v_mov_b32_e32 v25, s59 +; SI-NEXT: v_mov_b32_e32 v24, s58 +; SI-NEXT: v_mov_b32_e32 v27, s57 +; SI-NEXT: v_mov_b32_e32 v26, s56 +; SI-NEXT: v_mov_b32_e32 v29, s47 +; SI-NEXT: v_mov_b32_e32 v28, s46 +; SI-NEXT: v_mov_b32_e32 v31, s45 +; SI-NEXT: v_mov_b32_e32 v30, s44 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[3:4], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[4:5], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[7:8], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[50:51], 16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; VI: ; %bb.0: @@ -76581,7 +75393,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -76600,309 +75412,271 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v31 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v12 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_alignbit_b32 v23, v1, v33, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_alignbit_b32 v24, v23, v0, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v35, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_alignbit_b32 v25, v22, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v37, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; SI-NEXT: v_alignbit_b32 v26, v21, v0, 16 +; SI-NEXT: v_alignbit_b32 v20, v7, v39, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_alignbit_b32 v27, v20, v0, 16 +; SI-NEXT: v_alignbit_b32 v19, v9, v49, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v50 +; SI-NEXT: v_alignbit_b32 v28, v19, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, v11, v51, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_alignbit_b32 v29, v17, v0, 16 +; SI-NEXT: v_alignbit_b32 v14, v13, v54, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_alignbit_b32 v30, v14, v0, 16 +; SI-NEXT: v_alignbit_b32 v16, v15, v45, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v32, v16, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v62, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v61, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v60, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v59, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v58, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v57, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v56, 16 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42 +; SI-NEXT: v_alignbit_b32 v4, v4, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v6, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_alignbit_b32 v8, v8, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v10, v10, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v12, v12, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v18, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v16, v15, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v14, v13, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v17, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v19, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v20, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v21, v5, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v22, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v24, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v25, 16 +; SI-NEXT: v_alignbit_b32 v26, v21, v26, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v27, 16 +; SI-NEXT: v_alignbit_b32 v28, v19, v28, 16 +; SI-NEXT: v_alignbit_b32 v29, v17, v29, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v32, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -76921,70 +75695,54 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v21, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v17 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -78118,10 +76876,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_and_b32 s6, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s29, 16 ; SI-NEXT: s_and_b32 s8, s28, 0xffff0000 @@ -78146,11 +76905,11 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_and_b32 s41, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s42, s17, 0xffff0000 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_and_b32 s43, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -78168,295 +76927,431 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s42 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s41 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s40 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v12 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s15 -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s14 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s13 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s12 -; SI-NEXT: v_mul_f32_e64 v46, 1.0, s11 -; SI-NEXT: v_mul_f32_e64 v47, 1.0, s10 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s41 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s14 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s12 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s10 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s6 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v50 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshr_b64 v[34:35], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v63 +; SI-NEXT: v_lshr_b64 v[21:22], v[12:13], 16 +; SI-NEXT: v_mov_b32_e32 v22, v42 +; SI-NEXT: v_lshr_b64 v[41:42], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v28, v60 +; SI-NEXT: v_lshr_b64 v[31:32], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_mov_b32_e32 v29, v61 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_mov_b32_e32 v26, v59 +; SI-NEXT: v_mov_b32_e32 v6, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_mov_b32_e32 v27, v60 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v8, v58 +; SI-NEXT: v_mov_b32_e32 v9, v59 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v53 +; SI-NEXT: v_mov_b32_e32 v60, v58 +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v63 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_lshr_b64 v[10:11], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v11, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[43:44], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[40:41], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[26:27], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[24:25], v[36:37], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[57:58], 16 +; SI-NEXT: v_mov_b32_e32 v25, v18 +; SI-NEXT: v_mov_b32_e32 v18, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshr_b64 v[15:16], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[20:21], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[22:23], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[55:56], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_lshr_b64 v[45:46], v[50:51], 16 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[28:29], 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[2:3], v[59:60], 16 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: v_mov_b32_e32 v41, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v22, v15 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_mov_b32_e32 v53, v18 +; SI-NEXT: v_mov_b32_e32 v18, v25 +; SI-NEXT: v_mov_b32_e32 v52, v2 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[60:61], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshr_b64 v[55:56], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[10:11], v[43:44], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshr_b64 v[57:58], v[46:47], 16 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[17:18], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 +; SI-NEXT: v_lshr_b64 v[21:22], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[61:62], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v5, v60 +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[23:24], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshr_b64 v[34:35], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[33:34], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[16:17], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[56:57], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v55 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v47 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -78473,49 +77368,57 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: @@ -79886,97 +78789,49 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -79993,690 +78848,691 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: ; kill: killed $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_or_b32_e32 v54, v25, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_or_b32_e32 v50, v23, v9 -; SI-NEXT: v_alignbit_b32 v9, v50, v54, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v50, v54, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v58 +; SI-NEXT: v_or_b32_e32 v57, v1, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v45, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v29 -; SI-NEXT: v_or_b32_e32 v22, v28, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v21, v27, v9 -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v21, v22, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v38, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v37, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v19, v31, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v30, v9 -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v20, v19, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v35, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v36, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_or_b32_e32 v17, v34, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v33, v9 -; SI-NEXT: v_alignbit_b32 v9, v18, v17, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v18, v17, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v38 -; SI-NEXT: v_or_b32_e32 v16, v37, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_or_b32_e32 v15, v36, v9 -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v15, v16, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 -; SI-NEXT: v_or_b32_e32 v13, v48, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v39, v9 -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v14, v13, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 -; SI-NEXT: v_or_b32_e32 v11, v52, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_or_b32_e32 v12, v51, v9 -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v9, v12, v11, 8 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_or_b32_e32 v10, v40, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 -; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 -; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v33, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v31, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v29, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v30, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v27, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v25, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v56, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v46, v24, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v19, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v18, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v63 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v25, v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v27, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v16, v16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v28, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_or_b32_e32 v29, v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v30, v11, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v27 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v31, v9, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v32, v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v22, v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_or_b32_e32 v21, v24, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_or_b32_e32 v33, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v54, v24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_or_b32_e32 v50, v23, v24 -; SI-NEXT: v_alignbit_b32 v23, v50, v54, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v34, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_or_b32_e32 v35, v5, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v36, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v38, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v37, v4, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v50, v54, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v21, v22, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v15, v16, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v14, v13, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v9, v10, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v12 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v25, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 -; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 -; SI-NEXT: v_alignbit_b32 v42, v18, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v14 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_u32 v24, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v62, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v23, v1, 8, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_bfe_u32 v56, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v46, v24, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v23, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v52, v19, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v18, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v43 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_or_b32_e32 v8, v24, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v63 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v23, v8 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v8, v8, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v60 -; SI-NEXT: v_or_b32_e32 v8, v8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v58 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v61 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v59 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v47 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v50 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -81944,659 +80800,695 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_writelane_b32 v18, s64, 16 +; SI-NEXT: v_writelane_b32 v18, s65, 17 +; SI-NEXT: v_writelane_b32 v18, s66, 18 +; SI-NEXT: v_writelane_b32 v18, s67, 19 +; SI-NEXT: v_writelane_b32 v18, s68, 20 +; SI-NEXT: v_writelane_b32 v18, s69, 21 +; SI-NEXT: v_writelane_b32 v18, s70, 22 +; SI-NEXT: v_writelane_b32 v18, s71, 23 +; SI-NEXT: v_writelane_b32 v18, s80, 24 +; SI-NEXT: v_writelane_b32 v18, s81, 25 +; SI-NEXT: v_writelane_b32 v18, s82, 26 +; SI-NEXT: v_writelane_b32 v18, s83, 27 +; SI-NEXT: v_writelane_b32 v18, s84, 28 +; SI-NEXT: v_writelane_b32 v18, s85, 29 +; SI-NEXT: v_writelane_b32 v18, s86, 30 +; SI-NEXT: v_writelane_b32 v18, s87, 31 +; SI-NEXT: v_writelane_b32 v18, s96, 32 +; SI-NEXT: v_writelane_b32 v18, s97, 33 +; SI-NEXT: v_writelane_b32 v18, s98, 34 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 -; SI-NEXT: s_lshr_b32 s10, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: s_lshr_b32 s10, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v40, s30, 0 -; SI-NEXT: v_writelane_b32 v40, s31, 1 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_writelane_b32 v40, s34, 2 -; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: s_lshr_b32 s96, s29, 16 +; SI-NEXT: s_lshr_b32 s97, s28, 16 +; SI-NEXT: s_lshr_b32 s86, s27, 16 +; SI-NEXT: s_lshr_b32 s87, s26, 16 +; SI-NEXT: s_lshr_b32 s84, s25, 16 +; SI-NEXT: s_lshr_b32 s85, s24, 16 +; SI-NEXT: s_lshr_b32 s82, s23, 16 +; SI-NEXT: s_lshr_b32 s83, s22, 16 +; SI-NEXT: s_lshr_b32 s80, s21, 16 +; SI-NEXT: s_lshr_b32 s81, s20, 16 +; SI-NEXT: s_lshr_b32 s70, s19, 16 +; SI-NEXT: s_lshr_b32 s71, s18, 16 +; SI-NEXT: s_lshr_b32 s68, s17, 16 +; SI-NEXT: s_lshr_b32 s69, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s98, v2 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s99, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_writelane_b32 v40, s37, 5 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: v_readfirstlane_b32 s46, v5 +; SI-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 -; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 -; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 -; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v24 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 -; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v27 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v26 -; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v32 -; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 -; SI-NEXT: s_or_b32 s5, s21, s5 -; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s45, s19, 8 -; SI-NEXT: s_lshr_b32 s43, s17, 8 -; SI-NEXT: s_lshr_b32 s41, s15, 8 -; SI-NEXT: s_lshr_b32 s29, s13, 8 -; SI-NEXT: s_lshr_b32 s27, s11, 8 -; SI-NEXT: s_lshr_b32 s25, s9, 8 -; SI-NEXT: s_lshr_b32 s23, s7, 8 -; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v19, s4, 4 +; SI-NEXT: v_writelane_b32 v19, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v19, s4, 2 +; SI-NEXT: v_writelane_b32 v19, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v19, s4, 0 +; SI-NEXT: v_writelane_b32 v19, s5, 1 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v19, s4, 8 +; SI-NEXT: v_writelane_b32 v19, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v19, s4, 6 +; SI-NEXT: v_writelane_b32 v19, s5, 7 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s98, 0xffff +; SI-NEXT: s_lshl_b32 s45, s99, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; SI-NEXT: s_or_b32 s5, s5, s45 +; SI-NEXT: s_lshr_b32 s75, s43, 8 +; SI-NEXT: s_lshr_b32 s73, s41, 8 +; SI-NEXT: s_lshr_b32 s63, s15, 8 +; SI-NEXT: s_lshr_b32 s61, s13, 8 +; SI-NEXT: s_lshr_b32 s59, s11, 8 +; SI-NEXT: s_lshr_b32 s57, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: s_bfe_u32 s77, s68, 0x80008 +; SI-NEXT: s_bfe_u32 s79, s70, 0x80008 +; SI-NEXT: s_bfe_u32 s89, s80, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s82, 0x80008 +; SI-NEXT: s_bfe_u32 s93, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s56, s86, 0x80008 +; SI-NEXT: s_bfe_u32 s58, s96, 0x80008 +; SI-NEXT: s_bfe_u32 s60, s99, 0x80008 +; SI-NEXT: s_lshr_b64 s[64:65], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s99 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_readfirstlane_b32 s5, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s98 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s97 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s96 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s87 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 ; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readfirstlane_b32 s8, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s86 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s85 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readfirstlane_b32 s10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s84 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s83 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 +; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readfirstlane_b32 s12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s82 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 ; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s23 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s81 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readfirstlane_b32 s14, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s80 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 ; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s71 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s17, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readfirstlane_b32 s20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s70 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s19, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v9 -; SI-NEXT: v_readfirstlane_b32 s17, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v10 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: v_readfirstlane_b32 s20, v3 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s45, s19, 8 -; SI-NEXT: s_lshr_b32 s43, s17, 8 -; SI-NEXT: s_lshr_b32 s41, s15, 8 -; SI-NEXT: s_lshr_b32 s29, s13, 8 -; SI-NEXT: s_lshr_b32 s27, s11, 8 -; SI-NEXT: s_lshr_b32 s25, s9, 8 -; SI-NEXT: s_lshr_b32 s23, s7, 8 -; SI-NEXT: s_lshr_b32 s21, s5, 8 -; SI-NEXT: v_bfe_u32 v48, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v31, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v28, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 -; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s26, s26, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_or_b32 s18, s18, s26 -; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s20, s20, 24 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: s_lshl_b32 s19, s45, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v48 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v9, s18, v9 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s42, 8 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_and_b32 s18, s28, 0xff +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s69 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s24, 24 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_or_b32 s40, s20, s18 +; SI-NEXT: v_readfirstlane_b32 s18, v9 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_readfirstlane_b32 s16, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v10 +; SI-NEXT: s_or_b32 s41, s16, s18 +; SI-NEXT: v_readfirstlane_b32 s16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s68 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_or_b32 s42, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: s_or_b32 s43, s17, s16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s75, s43, 8 +; SI-NEXT: s_lshr_b32 s73, s41, 8 +; SI-NEXT: s_lshr_b32 s63, s15, 8 +; SI-NEXT: s_lshr_b32 s61, s13, 8 +; SI-NEXT: s_lshr_b32 s59, s11, 8 +; SI-NEXT: s_lshr_b32 s57, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: v_bfe_u32 v16, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v14, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v13, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v10, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v8, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v1, 8, 8 +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v19, s4, 0 +; SI-NEXT: v_writelane_b32 v19, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: v_writelane_b32 v19, s4, 2 +; SI-NEXT: v_writelane_b32 v19, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 4 +; SI-NEXT: v_writelane_b32 v19, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 6 +; SI-NEXT: v_writelane_b32 v19, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v19, s4, 8 +; SI-NEXT: v_writelane_b32 v19, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_readlane_b32 s18, v19, 4 +; SI-NEXT: v_readlane_b32 s20, v19, 2 +; SI-NEXT: v_readlane_b32 s22, v19, 8 +; SI-NEXT: v_readlane_b32 s24, v19, 0 +; SI-NEXT: v_readlane_b32 s26, v19, 6 +; SI-NEXT: v_mov_b32_e32 v1, s99 +; SI-NEXT: v_mov_b32_e32 v2, s96 +; SI-NEXT: v_mov_b32_e32 v3, s86 +; SI-NEXT: v_mov_b32_e32 v4, s84 +; SI-NEXT: v_mov_b32_e32 v5, s82 +; SI-NEXT: v_mov_b32_e32 v6, s80 +; SI-NEXT: v_mov_b32_e32 v9, s70 +; SI-NEXT: v_mov_b32_e32 v12, s68 +; SI-NEXT: v_mov_b32_e32 v7, s60 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v10, s56 +; SI-NEXT: v_mov_b32_e32 v11, s93 +; SI-NEXT: v_mov_b32_e32 v13, s91 +; SI-NEXT: v_mov_b32_e32 v14, s89 +; SI-NEXT: v_mov_b32_e32 v15, s79 +; SI-NEXT: v_mov_b32_e32 v16, s77 +; SI-NEXT: v_readlane_b32 s19, v19, 5 +; SI-NEXT: v_readlane_b32 s21, v19, 3 +; SI-NEXT: v_readlane_b32 s23, v19, 9 +; SI-NEXT: v_readlane_b32 s25, v19, 1 +; SI-NEXT: v_readlane_b32 s27, v19, 7 +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s24, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s20, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v17, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s75, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_or_b32_e32 v12, s16, v12 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s74, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s26, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s22, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s43, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s73, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: buffer_store_dword v16, v12, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v15 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v8, s16, v8 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v9, s16, v9 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_lshl_b32 s16, s76, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_and_b32 s16, s72, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s40, 24 +; SI-NEXT: s_lshl_b32 s17, s62, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v12, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s41, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s15, s63, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v14 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v7, s14, v7 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v6, s14, v6 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_lshl_b32 s14, s92, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s58, 0xff +; SI-NEXT: s_and_b32 s14, s90, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s46, 24 +; SI-NEXT: s_lshl_b32 s15, s78, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v9, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s29, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v13 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v6, s12, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: s_lshl_b32 s12, s66, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: s_and_b32 s12, s72, 0xff +; SI-NEXT: s_and_b32 s12, s64, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s60, 24 +; SI-NEXT: s_lshl_b32 s13, s88, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s27, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s11, s59, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v11 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s10, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s90, 8 +; SI-NEXT: s_lshl_b32 s10, s54, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s78, 0xff +; SI-NEXT: s_and_b32 s10, s52, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: s_lshl_b32 s11, s50, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s25, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s57, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, s8, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s94, 8 +; SI-NEXT: s_lshl_b32 s8, s48, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s92, 0xff +; SI-NEXT: s_and_b32 s8, s38, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s88, 24 +; SI-NEXT: s_lshl_b32 s9, s36, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s23, 8 +; SI-NEXT: s_lshl_b32 s7, s47, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s36, 8 +; SI-NEXT: s_lshl_b32 s6, s34, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s34, 0xff +; SI-NEXT: s_and_b32 s6, s30, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s30, 24 +; SI-NEXT: s_lshl_b32 s7, s94, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s37, v40, 5 -; SI-NEXT: v_readlane_b32 s36, v40, 4 -; SI-NEXT: v_readlane_b32 s35, v40, 3 -; SI-NEXT: v_readlane_b32 s34, v40, 2 -; SI-NEXT: v_readlane_b32 s31, v40, 1 -; SI-NEXT: v_readlane_b32 s30, v40, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v18, 35 +; SI-NEXT: v_readlane_b32 s98, v18, 34 +; SI-NEXT: v_readlane_b32 s97, v18, 33 +; SI-NEXT: v_readlane_b32 s96, v18, 32 +; SI-NEXT: v_readlane_b32 s87, v18, 31 +; SI-NEXT: v_readlane_b32 s86, v18, 30 +; SI-NEXT: v_readlane_b32 s85, v18, 29 +; SI-NEXT: v_readlane_b32 s84, v18, 28 +; SI-NEXT: v_readlane_b32 s83, v18, 27 +; SI-NEXT: v_readlane_b32 s82, v18, 26 +; SI-NEXT: v_readlane_b32 s81, v18, 25 +; SI-NEXT: v_readlane_b32 s80, v18, 24 +; SI-NEXT: v_readlane_b32 s71, v18, 23 +; SI-NEXT: v_readlane_b32 s70, v18, 22 +; SI-NEXT: v_readlane_b32 s69, v18, 21 +; SI-NEXT: v_readlane_b32 s68, v18, 20 +; SI-NEXT: v_readlane_b32 s67, v18, 19 +; SI-NEXT: v_readlane_b32 s66, v18, 18 +; SI-NEXT: v_readlane_b32 s65, v18, 17 +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; VI: ; %bb.0: @@ -83996,349 +82888,376 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v25 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v12 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v35 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v39, v46, v9 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v29, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v0, v29 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v39 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v36, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v23, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v0, v36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v17, v5, v0 +; SI-NEXT: v_alignbit_b32 v0, v3, v17, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v37, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v0, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v54, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v38, v42, v7 +; SI-NEXT: v_alignbit_b32 v53, v5, v19, 16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v38 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v25, v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v7, v25, 16 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v20, v57, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v12, v59, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v26, v63, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v13, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v15, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v27, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v22, v11, v26, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v2, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v61, v17, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v47 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v43, v17, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 -; SI-NEXT: v_or_b32_e32 v11, v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v11, v11, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v11, v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v48, v17, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v17, v1, v8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -84352,325 +83271,352 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_or_b32_e32 v25, v10, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v12, v9, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v47 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v10, v60, v10 -; SI-NEXT: v_or_b32_e32 v11, v57, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v14, v44, v14 -; SI-NEXT: v_or_b32_e32 v15, v41, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v23, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v16, v40, v16 -; SI-NEXT: v_or_b32_e32 v17, v52, v17 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v9 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_or_b32_e32 v23, v39, v23 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v49, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_alignbit_b32 v21, v13, v19, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_alignbit_b32 v22, v11, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v9, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v7, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: .LBB106_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v53 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v53, v5, v61, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v1 +; SI-NEXT: v_alignbit_b32 v1, v23, v0, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v3, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v61 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v62 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -84687,26 +83633,26 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -86421,546 +85367,783 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; SI-LABEL: bitcast_v64i8_to_v32f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s46, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v32, s30, 0 -; SI-NEXT: v_writelane_b32 v32, s31, 1 -; SI-NEXT: v_writelane_b32 v32, s34, 2 -; SI-NEXT: v_writelane_b32 v32, s35, 3 -; SI-NEXT: v_writelane_b32 v32, s36, 4 -; SI-NEXT: v_writelane_b32 v32, s37, 5 -; SI-NEXT: v_writelane_b32 v32, s38, 6 -; SI-NEXT: v_writelane_b32 v32, s39, 7 -; SI-NEXT: v_readfirstlane_b32 s74, v30 -; SI-NEXT: v_readfirstlane_b32 s61, v29 -; SI-NEXT: v_readfirstlane_b32 s63, v28 -; SI-NEXT: v_readfirstlane_b32 s59, v27 -; SI-NEXT: v_readfirstlane_b32 s60, v26 -; SI-NEXT: v_readfirstlane_b32 s57, v25 -; SI-NEXT: v_readfirstlane_b32 s58, v24 -; SI-NEXT: v_readfirstlane_b32 s47, v23 -; SI-NEXT: v_readfirstlane_b32 s56, v22 -; SI-NEXT: v_readfirstlane_b32 s44, v21 -; SI-NEXT: v_readfirstlane_b32 s34, v19 -; SI-NEXT: v_readfirstlane_b32 s37, v18 -; SI-NEXT: v_readfirstlane_b32 s94, v17 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: s_mov_b32 s92, s16 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s23, 0 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s21, 1 +; SI-NEXT: v_readfirstlane_b32 s47, v29 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s47, 2 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s82, v30 +; SI-NEXT: v_readfirstlane_b32 s83, v28 +; SI-NEXT: v_readfirstlane_b32 s44, v27 +; SI-NEXT: v_readfirstlane_b32 s96, v26 +; SI-NEXT: v_readfirstlane_b32 s70, v25 +; SI-NEXT: v_readfirstlane_b32 s68, v24 +; SI-NEXT: v_readfirstlane_b32 s84, v23 +; SI-NEXT: v_readfirstlane_b32 s65, v22 +; SI-NEXT: v_readfirstlane_b32 s86, v21 +; SI-NEXT: v_readfirstlane_b32 s66, v20 +; SI-NEXT: v_readfirstlane_b32 s87, v19 +; SI-NEXT: v_readfirstlane_b32 s80, v18 +; SI-NEXT: v_readfirstlane_b32 s36, v17 ; SI-NEXT: v_readfirstlane_b32 s31, v16 -; SI-NEXT: v_readfirstlane_b32 s90, v15 -; SI-NEXT: v_readfirstlane_b32 s93, v14 -; SI-NEXT: v_readfirstlane_b32 s79, v13 -; SI-NEXT: v_readfirstlane_b32 s39, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s38, v10 -; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v14 +; SI-NEXT: v_readfirstlane_b32 s67, v13 +; SI-NEXT: v_readfirstlane_b32 s34, v12 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s37, v9 ; SI-NEXT: v_readfirstlane_b32 s35, v8 -; SI-NEXT: v_readfirstlane_b32 s92, v7 -; SI-NEXT: v_readfirstlane_b32 s95, v6 -; SI-NEXT: v_readfirstlane_b32 s89, v5 -; SI-NEXT: v_readfirstlane_b32 s91, v4 -; SI-NEXT: v_readfirstlane_b32 s78, v3 -; SI-NEXT: v_readfirstlane_b32 s88, v2 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: v_readfirstlane_b32 s77, v0 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s6, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s9, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s8, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s12, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s10, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v6 +; SI-NEXT: v_readfirstlane_b32 s51, v5 +; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s53, v3 +; SI-NEXT: v_readfirstlane_b32 s54, v2 +; SI-NEXT: v_readfirstlane_b32 s89, v1 +; SI-NEXT: v_readfirstlane_b32 s90, v0 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s91, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s93, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s55, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: v_readfirstlane_b32 s50, v36 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s21, v38 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s56, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s85, v32 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s98, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s99, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s9, v51 +; SI-NEXT: v_writelane_b32 v41, s58, 3 +; SI-NEXT: v_writelane_b32 v41, s9, 4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s69, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s13, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s43, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s45, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s42, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s62, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: v_readfirstlane_b32 s30, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s91, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s36, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s13, s5, s6 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_or_b32 s6, s6, s5 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_lshl_b32 s7, s53, 8 +; SI-NEXT: s_or_b32 s14, s5, s7 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_or_b32 s8, s7, s5 +; SI-NEXT: s_and_b32 s5, s81, 0xff +; SI-NEXT: s_lshl_b32 s7, s71, 8 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s67, 24 +; SI-NEXT: s_or_b32 s10, s7, s5 +; SI-NEXT: s_and_b32 s5, s80, 0xff +; SI-NEXT: s_lshl_b32 s7, s87, 8 +; SI-NEXT: s_or_b32 s40, s5, s7 +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s86, 24 +; SI-NEXT: s_or_b32 s60, s7, s5 +; SI-NEXT: s_and_b32 s5, s96, 0xff +; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_or_b32 s42, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xff +; SI-NEXT: s_lshl_b32 s7, s79, 8 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s7, v41, 1 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: v_readlane_b32 s9, v41, 0 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_or_b32 s57, s9, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s90, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s89, 24 +; SI-NEXT: s_or_b32 s77, s11, s9 +; SI-NEXT: s_and_b32 s9, s94, 0xff +; SI-NEXT: s_lshl_b32 s11, s49, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s11, s35, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_writelane_b32 v41, s44, 11 +; SI-NEXT: s_lshl_b32 s44, s37, 24 +; SI-NEXT: s_or_b32 vcc_lo, s44, s11 +; SI-NEXT: s_and_b32 s11, s38, 0xff +; SI-NEXT: s_lshl_b32 s44, s64, 8 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_lshl_b32 s45, s36, 24 +; SI-NEXT: s_or_b32 vcc_hi, s45, s44 +; SI-NEXT: s_and_b32 s44, s65, 0xff +; SI-NEXT: s_lshl_b32 s45, s84, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s68, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_mov_b32 s21, s46 +; SI-NEXT: s_lshl_b32 s46, s70, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s97, 12 +; SI-NEXT: s_mov_b32 s97, s86 +; SI-NEXT: s_mov_b32 s86, s84 +; SI-NEXT: s_mov_b32 s84, s70 +; SI-NEXT: s_mov_b32 s70, s34 +; SI-NEXT: s_mov_b32 s34, s88 +; SI-NEXT: s_mov_b32 s88, s24 +; SI-NEXT: s_or_b32 s24, s46, s45 +; SI-NEXT: s_or_b32 s61, s44, s24 +; SI-NEXT: s_and_b32 s44, s82, 0xff +; SI-NEXT: s_lshl_b32 s45, s30, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s69, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s78, 24 +; SI-NEXT: s_mov_b32 s95, s90 +; SI-NEXT: s_mov_b32 s90, s18 +; SI-NEXT: s_or_b32 s18, s46, s45 +; SI-NEXT: s_and_b32 s45, s83, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s47, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s62, s46, s45 +; SI-NEXT: s_or_b32 s63, s44, s18 +; SI-NEXT: s_and_b32 s44, s98, 0xff +; SI-NEXT: s_lshl_b32 s45, s58, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s85, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s56, 24 +; SI-NEXT: s_mov_b32 s76, s56 +; SI-NEXT: s_mov_b32 s56, s85 +; SI-NEXT: s_mov_b32 s85, s79 +; SI-NEXT: s_mov_b32 s79, s19 +; SI-NEXT: s_or_b32 s19, s46, s45 +; SI-NEXT: s_and_b32 s45, s99, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s21, 24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s72, s46, s45 +; SI-NEXT: s_or_b32 s73, s44, s19 +; SI-NEXT: s_and_b32 s44, s52, 0xff +; SI-NEXT: s_lshl_b32 s45, s93, 8 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s16, 0xff +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s91, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_mov_b32 s47, s96 +; SI-NEXT: s_mov_b32 s96, s78 +; SI-NEXT: s_mov_b32 s78, s69 +; SI-NEXT: s_mov_b32 s69, s68 +; SI-NEXT: s_mov_b32 s68, s38 +; SI-NEXT: s_mov_b32 s38, s35 +; SI-NEXT: s_mov_b32 s35, s89 +; SI-NEXT: s_or_b32 s89, s46, s45 +; SI-NEXT: s_and_b32 s45, s50, 0xff +; SI-NEXT: s_or_b32 s5, s5, s57 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_lshl_b32 s46, s55, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s74, s46, s45 +; SI-NEXT: s_mov_b32 s45, s83 +; SI-NEXT: s_mov_b32 s83, s91 +; SI-NEXT: s_mov_b32 s91, s28 +; SI-NEXT: s_and_b32 s28, s42, 0xffff +; SI-NEXT: s_mov_b32 s59, s94 +; SI-NEXT: s_mov_b32 s94, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_or_b32 s42, s12, s4 +; SI-NEXT: s_mov_b32 s43, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_or_b32 s9, s9, vcc_lo +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: s_or_b32 s11, s11, vcc_hi +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 +; SI-NEXT: s_or_b32 s7, s7, s77 +; SI-NEXT: s_or_b32 s75, s44, s89 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s58, s15, 0xffff +; SI-NEXT: s_mov_b32 s44, s82 +; SI-NEXT: s_mov_b32 s82, s81 +; SI-NEXT: s_mov_b32 s81, s55 +; SI-NEXT: s_mov_b32 s55, s54 +; SI-NEXT: s_mov_b32 s54, s51 +; SI-NEXT: s_mov_b32 s51, s37 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_mov_b32 s46, s98 +; SI-NEXT: s_mov_b32 s98, s93 +; SI-NEXT: s_and_b32 s93, s41, 0xffff +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_or_b32 s40, s13, s6 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshr_b64 s[48:49], s[6:7], 16 +; SI-NEXT: s_or_b32 s14, s14, s8 +; SI-NEXT: s_mov_b32 s15, s9 +; SI-NEXT: s_or_b32 s12, s58, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: s_or_b32 s10, s16, s60 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_or_b32 s8, s93, s62 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_or_b32 s6, s28, s72 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_or_b32 s4, s27, s74 +; SI-NEXT: s_mov_b32 s5, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s16, s37 +; SI-NEXT: s_mov_b32 s37, s51 +; SI-NEXT: s_mov_b32 s51, s54 +; SI-NEXT: s_mov_b32 s54, s55 +; SI-NEXT: s_mov_b32 s55, s81 +; SI-NEXT: s_mov_b32 s81, s82 +; SI-NEXT: s_mov_b32 s82, s44 +; SI-NEXT: v_readlane_b32 s44, v41, 11 +; SI-NEXT: s_mov_b32 s93, s98 +; SI-NEXT: s_mov_b32 s98, s46 +; SI-NEXT: s_mov_b32 s46, s21 +; SI-NEXT: s_mov_b32 s21, s23 +; SI-NEXT: s_mov_b32 s28, s91 +; SI-NEXT: s_mov_b32 s91, s83 +; SI-NEXT: s_mov_b32 s83, s45 +; SI-NEXT: s_mov_b32 s27, s94 +; SI-NEXT: s_mov_b32 s94, s59 +; SI-NEXT: s_lshr_b32 s23, s57, 16 +; SI-NEXT: s_lshr_b32 s57, s77, 16 +; SI-NEXT: s_lshr_b32 s59, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s61, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s63, s24, 16 +; SI-NEXT: s_mov_b32 s24, s88 +; SI-NEXT: s_mov_b32 s88, s34 +; SI-NEXT: s_mov_b32 s34, s70 +; SI-NEXT: s_mov_b32 s70, s84 +; SI-NEXT: s_mov_b32 s84, s86 +; SI-NEXT: s_mov_b32 s86, s97 +; SI-NEXT: v_readlane_b32 s97, v41, 12 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_mov_b32 s18, s90 +; SI-NEXT: s_mov_b32 s90, s95 +; SI-NEXT: s_mov_b32 s49, s39 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_mov_b32 s19, s79 +; SI-NEXT: s_mov_b32 s79, s85 +; SI-NEXT: s_mov_b32 s85, s56 +; SI-NEXT: s_mov_b32 s56, s76 +; SI-NEXT: s_lshr_b32 s45, s89, 16 +; SI-NEXT: s_mov_b32 s89, s35 +; SI-NEXT: s_mov_b32 s35, s38 +; SI-NEXT: s_mov_b32 s38, s68 +; SI-NEXT: s_mov_b32 s68, s69 +; SI-NEXT: s_mov_b32 s69, s78 +; SI-NEXT: s_mov_b32 s78, s96 +; SI-NEXT: s_mov_b32 s96, s47 +; SI-NEXT: s_mov_b64 s[76:77], 0 +; SI-NEXT: s_branch .LBB107_3 +; SI-NEXT: .LBB107_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 s[76:77], -1 +; SI-NEXT: v_writelane_b32 v41, s4, 5 +; SI-NEXT: v_writelane_b32 v41, s5, 6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: v_writelane_b32 v41, s4, 7 +; SI-NEXT: v_writelane_b32 v41, s5, 8 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 9 +; SI-NEXT: v_writelane_b32 v41, s5, 10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: .LBB107_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[76:77] +; SI-NEXT: v_readlane_b32 s76, v41, 5 +; SI-NEXT: v_readlane_b32 s77, v41, 6 +; SI-NEXT: s_mov_b32 s58, s76 +; SI-NEXT: v_readlane_b32 s76, v41, 7 +; SI-NEXT: v_readlane_b32 s77, v41, 8 +; SI-NEXT: s_cbranch_vccnz .LBB107_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s93, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s75, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s73, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_cbranch_execnz .LBB107_3 -; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s50, s50, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s11, 0xff -; SI-NEXT: s_lshl_b32 s6, s7, 8 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s6, s50, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s55, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s39, s52, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: s_lshl_b32 s6, s93, 8 +; SI-NEXT: s_add_i32 s79, s16, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s8, 8 -; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v41, 4 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: s_and_b32 s6, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s97, 8 +; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s15, 0xff -; SI-NEXT: s_lshl_b32 s8, s10, 8 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_and_b32 s8, s99, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s46, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s8, v41, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s98, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s85, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s56, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s96, s96, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s41, 0xff -; SI-NEXT: s_lshl_b32 s9, s13, 8 -; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_and_b32 s8, s96, 0xff +; SI-NEXT: s_lshl_b32 s9, s44, 8 +; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s43, 0xff -; SI-NEXT: s_lshl_b32 s10, s14, 8 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: v_readlane_b32 s9, v41, 2 +; SI-NEXT: s_and_b32 s10, s83, 0xff +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s82, 0xff +; SI-NEXT: s_lshl_b32 s10, s30, 8 +; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s45, 0xff -; SI-NEXT: s_lshl_b32 s11, s40, 8 -; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s10, s78, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s80, 0xff +; SI-NEXT: s_lshl_b32 s11, s87, 8 +; SI-NEXT: s_add_i32 s66, s66, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s73, 0xff -; SI-NEXT: s_lshl_b32 s12, s42, 8 -; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_and_b32 s12, s66, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s11, s86, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s65, 0xff +; SI-NEXT: s_lshl_b32 s12, s84, 8 +; SI-NEXT: s_add_i32 s52, s68, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_and_b32 s12, s75, 0xff -; SI-NEXT: s_lshl_b32 s13, s62, 8 -; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_and_b32 s13, s52, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s55, s81, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s55, 0xff +; SI-NEXT: s_lshl_b32 s13, s71, 8 +; SI-NEXT: s_add_i32 s48, s34, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: s_and_b32 s13, s74, 0xff -; SI-NEXT: s_lshl_b32 s14, s72, 8 -; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s14, s48, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s67, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s38, 0xff +; SI-NEXT: s_lshl_b32 s14, s64, 8 +; SI-NEXT: s_add_i32 s31, s31, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_and_b32 s14, s63, 0xff -; SI-NEXT: s_lshl_b32 s15, s61, 8 -; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_and_b32 s15, s31, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s14, s36, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_add_i32 s36, s54, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s36, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_add_i32 s95, s88, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s60, 0xff -; SI-NEXT: s_lshl_b32 s40, s59, 8 -; SI-NEXT: s_add_i32 s58, s58, 3 -; SI-NEXT: s_or_b32 s15, s40, s15 -; SI-NEXT: s_and_b32 s40, s58, 0xff -; SI-NEXT: s_lshl_b32 s41, s57, 8 -; SI-NEXT: s_add_i32 s56, s56, 3 -; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: s_and_b32 s41, s56, 0xff -; SI-NEXT: s_lshl_b32 s42, s47, 8 -; SI-NEXT: s_add_i32 s46, s46, 3 -; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_and_b32 s42, s46, 0xff -; SI-NEXT: s_lshl_b32 s43, s44, 8 -; SI-NEXT: s_add_i32 s37, s37, 3 -; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: s_and_b32 s43, s37, 0xff -; SI-NEXT: s_lshl_b32 s44, s34, 8 -; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_and_b32 s44, s31, 0xff -; SI-NEXT: s_lshl_b32 s45, s94, 8 -; SI-NEXT: s_add_i32 s93, s93, 3 -; SI-NEXT: s_or_b32 s44, s45, s44 -; SI-NEXT: s_and_b32 s45, s93, 0xff -; SI-NEXT: s_lshl_b32 s46, s90, 8 -; SI-NEXT: s_add_i32 s39, s39, 3 -; SI-NEXT: s_or_b32 s45, s46, s45 -; SI-NEXT: s_and_b32 s46, s39, 0xff -; SI-NEXT: s_lshl_b32 s47, s79, 8 -; SI-NEXT: s_add_i32 s38, s38, 3 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s38, 0xff -; SI-NEXT: s_lshl_b32 s56, s36, 8 -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: s_or_b32 s47, s56, s47 -; SI-NEXT: s_and_b32 s56, s35, 0xff -; SI-NEXT: s_lshl_b32 s57, s30, 8 -; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: s_or_b32 s56, s57, s56 -; SI-NEXT: s_and_b32 s57, s95, 0xff -; SI-NEXT: s_lshl_b32 s58, s92, 8 -; SI-NEXT: s_add_i32 s91, s91, 3 -; SI-NEXT: s_or_b32 s57, s58, s57 -; SI-NEXT: s_and_b32 s58, s91, 0xff -; SI-NEXT: s_lshl_b32 s59, s89, 8 -; SI-NEXT: s_add_i32 s88, s88, 3 -; SI-NEXT: s_or_b32 s58, s59, s58 -; SI-NEXT: s_and_b32 s59, s88, 0xff -; SI-NEXT: s_lshl_b32 s60, s78, 8 -; SI-NEXT: s_add_i32 s77, s77, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s21, s95, 0xff +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s15, s21 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s94, 0xff +; SI-NEXT: s_lshl_b32 s21, s49, 8 +; SI-NEXT: s_add_i32 s91, s35, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_lshl_b32 s21, s37, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s15, s21, s15 +; SI-NEXT: s_and_b32 s21, s24, 0xff +; SI-NEXT: s_lshl_b32 s16, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s23, s26, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s27, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s40, s21, 0x3000000 +; SI-NEXT: s_and_b32 s21, s28, 0xff +; SI-NEXT: s_lshl_b32 s16, s29, 8 +; SI-NEXT: s_lshl_b32 s23, s89, 24 +; SI-NEXT: s_add_i32 s89, s90, 3 +; SI-NEXT: s_or_b32 s21, s16, s21 +; SI-NEXT: s_and_b32 s16, s89, 0xff +; SI-NEXT: s_addk_i32 s21, 0x300 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s23, s16 +; SI-NEXT: s_or_b32 s16, s16, s21 +; SI-NEXT: s_add_i32 s41, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s92, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s59, s60, s59 -; SI-NEXT: s_and_b32 s60, s77, 0xff -; SI-NEXT: s_lshl_b32 s61, s76, 8 -; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: s_lshl_b32 s29, s29, 8 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_lshl_b32 s25, s25, 8 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s23, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s19, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_add_i32 s16, s20, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 1 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_or_b32 s60, s61, s60 -; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s18, s22, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: s_addk_i32 s10, 0x300 -; SI-NEXT: s_addk_i32 s11, 0x300 -; SI-NEXT: s_addk_i32 s12, 0x300 -; SI-NEXT: s_addk_i32 s13, 0x300 -; SI-NEXT: s_addk_i32 s14, 0x300 -; SI-NEXT: s_addk_i32 s15, 0x300 -; SI-NEXT: s_addk_i32 s40, 0x300 -; SI-NEXT: s_addk_i32 s41, 0x300 -; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: s_addk_i32 s43, 0x300 -; SI-NEXT: s_addk_i32 s44, 0x300 -; SI-NEXT: s_addk_i32 s45, 0x300 -; SI-NEXT: s_addk_i32 s46, 0x300 -; SI-NEXT: s_addk_i32 s47, 0x300 -; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_addk_i32 s57, 0x300 -; SI-NEXT: s_addk_i32 s58, 0x300 -; SI-NEXT: s_addk_i32 s59, 0x300 -; SI-NEXT: s_addk_i32 s60, 0x300 -; SI-NEXT: s_addk_i32 s28, 0x300 -; SI-NEXT: s_addk_i32 s26, 0x300 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_addk_i32 s22, 0x300 -; SI-NEXT: s_addk_i32 s20, 0x300 -; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_readlane_b32 s39, v32, 7 -; SI-NEXT: v_readlane_b32 s38, v32, 6 -; SI-NEXT: v_readlane_b32 s37, v32, 5 -; SI-NEXT: v_readlane_b32 s36, v32, 4 -; SI-NEXT: v_readlane_b32 s35, v32, 3 -; SI-NEXT: v_readlane_b32 s34, v32, 2 -; SI-NEXT: v_readlane_b32 s31, v32, 1 -; SI-NEXT: v_readlane_b32 s30, v32, 0 -; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 9 +; SI-NEXT: s_lshr_b32 s23, s43, 16 +; SI-NEXT: s_lshr_b32 s57, s41, 16 +; SI-NEXT: s_lshr_b32 s59, s15, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s5, 16 +; SI-NEXT: v_writelane_b32 v41, s17, 10 +; SI-NEXT: .LBB107_5: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s43, 0xffff +; SI-NEXT: s_lshl_b32 s18, s23, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s40, 0xffff +; SI-NEXT: s_lshl_b32 s19, s48, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xffff +; SI-NEXT: s_lshl_b32 s20, s57, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s20, s76, 16 +; SI-NEXT: s_or_b32 s14, s14, s20 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s20, s59, 16 +; SI-NEXT: s_or_b32 s15, s15, s20 +; SI-NEXT: v_readlane_b32 s20, v41, 9 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s20, s61, 16 +; SI-NEXT: s_or_b32 s13, s13, s20 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s20, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s20 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s20, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s20 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s20, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s20 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s20, s73, 16 +; SI-NEXT: s_or_b32 s9, s9, s20 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s20, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s20 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s20, s75, 16 +; SI-NEXT: s_or_b32 s7, s7, s20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s20, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s20 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s20, s45, 16 +; SI-NEXT: s_or_b32 s5, s5, s20 +; SI-NEXT: v_readlane_b32 s21, v41, 10 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: v_mov_b32_e32 v13, s7 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v64i8_to_v32f16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 36caff3752e26..361a93919fed7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4722,286 +4722,142 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36f16: @@ -5410,85 +5266,47 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_readfirstlane_b32 s23, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s21, v7 -; SI-NEXT: v_readfirstlane_b32 s20, v8 -; SI-NEXT: v_readfirstlane_b32 s19, v9 -; SI-NEXT: v_readfirstlane_b32 s18, v10 -; SI-NEXT: v_readfirstlane_b32 s17, v11 -; SI-NEXT: v_readfirstlane_b32 s16, v12 -; SI-NEXT: v_readfirstlane_b32 s15, v13 -; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_readfirstlane_b32 s13, v15 -; SI-NEXT: v_readfirstlane_b32 s12, v16 -; SI-NEXT: v_readfirstlane_b32 s11, v17 -; SI-NEXT: v_readfirstlane_b32 s10, v18 -; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_and_b64 s[22:23], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 @@ -5501,175 +5319,123 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: s_lshr_b32 s5, s22, 16 -; SI-NEXT: s_lshr_b32 s24, s21, 16 -; SI-NEXT: s_lshr_b32 s25, s20, 16 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s18, 16 -; SI-NEXT: s_lshr_b32 s28, s17, 16 -; SI-NEXT: s_lshr_b32 s29, s16, 16 -; SI-NEXT: s_lshr_b32 s40, s15, 16 -; SI-NEXT: s_lshr_b32 s41, s14, 16 -; SI-NEXT: s_lshr_b32 s42, s13, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s45, s10, 16 -; SI-NEXT: s_lshr_b32 s46, s8, 16 -; SI-NEXT: s_lshr_b32 s47, s7, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v2, v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: @@ -6152,68 +5918,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6230,101 +5935,102 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -6339,20 +6045,44 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -6361,10 +6091,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -6373,14 +6103,15 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -6389,32 +6120,32 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -6423,10 +6154,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -6434,29 +6165,30 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -6464,10 +6196,10 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -6475,40 +6207,38 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -7003,160 +6733,106 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v18i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7169,10 +6845,10 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7181,199 +6857,126 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v36f16_to_v18i32_scalar: @@ -11846,305 +11449,161 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v28, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v18f32_to_v36f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18f32_to_v36f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 @@ -12502,276 +11961,158 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, s16 -; SI-NEXT: v_mov_b32_e32 v44, s17 -; SI-NEXT: v_mov_b32_e32 v43, s18 -; SI-NEXT: v_mov_b32_e32 v42, s19 -; SI-NEXT: v_mov_b32_e32 v41, s20 -; SI-NEXT: v_mov_b32_e32 v40, s21 -; SI-NEXT: v_mov_b32_e32 v55, s22 -; SI-NEXT: v_mov_b32_e32 v54, s23 -; SI-NEXT: v_mov_b32_e32 v53, s24 -; SI-NEXT: v_mov_b32_e32 v52, s25 -; SI-NEXT: v_mov_b32_e32 v50, s26 -; SI-NEXT: v_mov_b32_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v48, s28 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v51, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v4, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v52 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: @@ -13354,68 +12695,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -13432,101 +12712,102 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -13541,20 +12822,44 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -13563,10 +12868,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -13575,14 +12880,15 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -13591,32 +12897,32 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -13625,10 +12931,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -13636,29 +12942,30 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -13666,10 +12973,10 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -13677,40 +12984,38 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -14205,160 +13510,106 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v18f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14371,10 +13622,10 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -14383,199 +13634,126 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v36f16_to_v18f32_scalar: @@ -18207,127 +17385,46 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -18350,143 +17447,80 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v27, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36f16: @@ -18905,266 +17939,176 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_readfirstlane_b32 s22, v5 -; SI-NEXT: v_readfirstlane_b32 s23, v6 -; SI-NEXT: v_readfirstlane_b32 s20, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v9 -; SI-NEXT: v_readfirstlane_b32 s19, v10 -; SI-NEXT: v_readfirstlane_b32 s16, v11 -; SI-NEXT: v_readfirstlane_b32 s17, v12 -; SI-NEXT: v_readfirstlane_b32 s14, v13 -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_readfirstlane_b32 s12, v15 -; SI-NEXT: v_readfirstlane_b32 s13, v16 -; SI-NEXT: v_readfirstlane_b32 s10, v17 -; SI-NEXT: v_readfirstlane_b32 s11, v18 -; SI-NEXT: v_readfirstlane_b32 s7, v0 -; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v5 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s16, v9 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s12, v13 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s11, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_and_b64 s[22:23], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s22 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s22, 3 -; SI-NEXT: s_addc_u32 s5, s23, 0 -; SI-NEXT: s_lshr_b32 s22, s4, 16 -; SI-NEXT: s_lshr_b32 s23, s5, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s24, s20, 16 -; SI-NEXT: s_lshr_b32 s25, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s26, s18, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s28, s16, 16 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s40, s14, 16 -; SI-NEXT: s_lshr_b32 s41, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: s_lshr_b32 s43, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s22 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s9, 16 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s21, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_or_b32_e32 v2, v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v11, v24, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 -; SI-NEXT: v_or_b32_e32 v4, v31, v4 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: v_or_b32_e32 v8, v27, v8 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_or_b32_e32 v12, v23, v12 -; SI-NEXT: v_or_b32_e32 v14, v21, v14 -; SI-NEXT: v_or_b32_e32 v16, v19, v16 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: s_lshl_b32 s23, s56, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s23 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s23, s76, 16 +; SI-NEXT: s_or_b32 s21, s21, s23 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s23, s46, 16 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s23, s75, 16 +; SI-NEXT: s_or_b32 s19, s19, s23 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s23, s44, 16 +; SI-NEXT: s_or_b32 s16, s16, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s23, s74, 16 +; SI-NEXT: s_or_b32 s17, s17, s23 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s23, s42, 16 +; SI-NEXT: s_or_b32 s14, s14, s23 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s23, s73, 16 +; SI-NEXT: s_or_b32 s15, s15, s23 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s23, s40, 16 +; SI-NEXT: s_or_b32 s12, s12, s23 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s23, s72, 16 +; SI-NEXT: s_or_b32 s13, s13, s23 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s23, s28, 16 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s23, s63, 16 +; SI-NEXT: s_or_b32 s11, s11, s23 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s23, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s23 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s9, s9, s23 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s23, s24, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s23, s61, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: @@ -19647,68 +18591,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19725,101 +18608,102 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -19834,20 +18718,44 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -19856,10 +18764,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19868,14 +18776,15 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19884,32 +18793,32 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -19918,10 +18827,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -19929,29 +18838,30 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -19959,10 +18869,10 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -19970,40 +18880,38 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -20498,160 +19406,106 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-LABEL: bitcast_v36f16_to_v9i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -20664,10 +19518,10 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20676,199 +19530,126 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v36f16_to_v9i64_scalar: @@ -23774,118 +22555,46 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -23897,145 +22606,82 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v19, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v20, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v26, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v31, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v52 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36f16: @@ -24394,274 +23040,149 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mov_b32_e32 v17, s16 -; SI-NEXT: v_mov_b32_e32 v18, s17 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s21 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s23 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s25 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s27 -; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v6, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v5, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v5, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v2, v18 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: @@ -25211,68 +23732,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v32, v17 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25289,101 +23749,102 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v41, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v55, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v41 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 -; SI-NEXT: v_or_b32_e32 v2, v62, v2 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 -; SI-NEXT: v_or_b32_e32 v4, v58, v4 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v44, v7 -; SI-NEXT: v_or_b32_e32 v8, v42, v8 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v38, v14 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -25398,20 +23859,44 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; kill: killed $vgpr18 @@ -25420,10 +23905,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -25432,14 +23917,15 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -25448,32 +23934,32 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v52 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -25482,10 +23968,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -25493,29 +23979,30 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -25523,10 +24010,10 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -25534,40 +24021,38 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v35 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 @@ -26062,160 +24547,106 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-LABEL: bitcast_v36f16_to_v9f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s12 -; SI-NEXT: s_lshr_b32 s12, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 -; SI-NEXT: s_lshr_b32 s12, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 -; SI-NEXT: s_lshr_b32 s12, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 -; SI-NEXT: s_lshr_b32 s12, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v37, v9 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_or_b32_e32 v11, v29, v11 -; SI-NEXT: v_or_b32_e32 v12, v28, v12 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_or_b32_e32 v14, v24, v14 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -26228,10 +24659,10 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26240,199 +24671,126 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v37 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v42 -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_mov_b32_e32 v43, v37 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v44, v38 -; SI-NEXT: v_mov_b32_e32 v38, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v45 -; SI-NEXT: v_mov_b32_e32 v45, v39 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: v_mov_b32_e32 v49, v23 -; SI-NEXT: v_mov_b32_e32 v50, v24 -; SI-NEXT: v_mov_b32_e32 v51, v25 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_mov_b32_e32 v54, v28 -; SI-NEXT: v_mov_b32_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v40, v30 -; SI-NEXT: v_mov_b32_e32 v41, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v45, v32 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v38, v44 -; SI-NEXT: v_mov_b32_e32 v44, v33 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: v_mov_b32_e32 v37, v43 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_mov_b32_e32 v19, v36 -; SI-NEXT: v_mov_b32_e32 v36, v42 -; SI-NEXT: v_mov_b32_e32 v42, v35 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v41 -; SI-NEXT: v_mov_b32_e32 v30, v40 -; SI-NEXT: v_mov_b32_e32 v29, v55 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v27, v53 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v25, v51 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v23, v49 -; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v36f16_to_v9f64_scalar: @@ -26810,9 +25168,39 @@ end: define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -26829,123 +25217,116 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 ; SI-NEXT: ; kill: killed $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v48, v1, v57 +; SI-NEXT: v_alignbit_b32 v1, v48, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v39, v1, v58 +; SI-NEXT: v_alignbit_b32 v1, v39, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v37, v1, v60 +; SI-NEXT: v_alignbit_b32 v1, v37, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v34, v1, v62 +; SI-NEXT: v_alignbit_b32 v1, v34, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v31, v1, v50 +; SI-NEXT: v_alignbit_b32 v1, v31, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v28, v1, v52 +; SI-NEXT: v_alignbit_b32 v1, v28, v63, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v45, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v25, v1, v55 +; SI-NEXT: v_or_b32_e32 v42, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v25, v51, 16 +; SI-NEXT: v_or_b32_e32 v53, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v49, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v23, v1, v41 +; SI-NEXT: v_or_b32_e32 v38, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_alignbit_b32 v1, v23, v54, 16 +; SI-NEXT: v_or_b32_e32 v36, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v32, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v18, v1, v43 +; SI-NEXT: v_or_b32_e32 v29, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_alignbit_b32 v44, v18, v40, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -26964,201 +25345,161 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v60, v4 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v0, v48, v45, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v39, v42, 16 +; SI-NEXT: v_or_b32_e32 v16, v40, v16 +; SI-NEXT: v_or_b32_e32 v14, v54, v14 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v37, v53, 16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v52, v10 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v0, v34, v49, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v55, v12 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v31, v38, 16 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v34 +; SI-NEXT: v_alignbit_b32 v0, v28, v36, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_alignbit_b32 v0, v25, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v0, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27175,12 +25516,46 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v39 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27548,249 +25923,351 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-LABEL: bitcast_v36i16_to_v36f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: s_lshr_b32 s30, s29, 16 +; SI-NEXT: s_lshr_b32 s49, s28, 16 +; SI-NEXT: s_lshr_b32 s95, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s26, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s24, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s22, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s20, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s18, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s52, v3 +; SI-NEXT: v_readfirstlane_b32 s54, v2 +; SI-NEXT: v_readfirstlane_b32 s50, v1 +; SI-NEXT: v_readfirstlane_b32 s51, v0 +; SI-NEXT: v_readfirstlane_b32 s34, v5 +; SI-NEXT: v_readfirstlane_b32 s55, v6 +; SI-NEXT: v_readfirstlane_b32 s31, v7 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s53, v8 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v50 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s35, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s91, 16 +; SI-NEXT: s_or_b32 s40, s4, s46 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s56, s36, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s92, 16 +; SI-NEXT: s_or_b32 s14, s4, s56 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s58, s37, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: s_or_b32 s12, s4, s58 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s60, s38, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s94, 16 +; SI-NEXT: s_or_b32 s10, s4, s60 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s62, s39, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s95, 16 +; SI-NEXT: s_or_b32 s8, s4, s62 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s72, s48, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_or_b32 s6, s4, s72 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s50, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_lshl_b32 s42, s53, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s52, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s51, 0xffff +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_lshl_b32 s78, s55, 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s54, 0xffff +; SI-NEXT: s_mov_b32 s41, s47 +; SI-NEXT: s_lshr_b64 s[46:47], s[46:47], 16 +; SI-NEXT: s_mov_b32 s15, s57 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 16 +; SI-NEXT: s_mov_b32 s13, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s5, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_or_b32 s42, s42, s78 +; SI-NEXT: s_mov_b32 s43, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s54, s54, 3 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s52, s52, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s51, s51, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s31, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s6, s30, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s26, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s27, 0xffff +; SI-NEXT: s_lshl_b32 s8, s95, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s39, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s25, 0xffff +; SI-NEXT: s_lshl_b32 s10, s94, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s38, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s12, s93, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s20, 0xffff +; SI-NEXT: s_lshl_b32 s13, s37, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s14, s92, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s18, 0xffff +; SI-NEXT: s_lshl_b32 s15, s36, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s19, 0xffff +; SI-NEXT: s_lshl_b32 s18, s91, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: s_or_b32 s15, s18, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s35, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s90, s41, 16 +; SI-NEXT: s_lshr_b32 s91, s15, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s11, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s7, 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s45, 16 +; SI-NEXT: s_lshr_b32 s34, s43, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s41, 0xffff +; SI-NEXT: s_lshl_b32 s18, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s18, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s18 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s18, s91, 16 +; SI-NEXT: s_or_b32 s15, s15, s18 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s18, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s18 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s18, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s18 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s18, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s18, s93, 16 +; SI-NEXT: s_or_b32 s11, s11, s18 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s18, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s18 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s9, s9, s18 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s18, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s18 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s18, s95, 16 +; SI-NEXT: s_or_b32 s7, s7, s18 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s18, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s18 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s18, s30, 16 +; SI-NEXT: s_or_b32 s5, s5, s18 +; SI-NEXT: s_and_b32 s18, s44, 0xffff +; SI-NEXT: s_lshl_b32 s19, s76, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s45, 0xffff +; SI-NEXT: s_lshl_b32 s20, s31, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s42, 0xffff +; SI-NEXT: s_lshl_b32 s21, s78, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s43, 0xffff +; SI-NEXT: s_lshl_b32 s22, s34, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v13, s5 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v36i16_to_v36f16_scalar: @@ -28385,310 +26862,238 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v36i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v17, v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_or_b32_e32 v13, v13, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_or_b32_e32 v11, v11, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_or_b32_e32 v9, v9, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v5, v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v36 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: v_or_b32_e32 v22, v22, v30 -; SI-NEXT: v_or_b32_e32 v20, v20, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v27 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 ; SI-NEXT: v_alignbit_b32 v35, v1, v35, 16 ; SI-NEXT: v_alignbit_b32 v34, v3, v34, 16 ; SI-NEXT: v_alignbit_b32 v33, v5, v33, 16 ; SI-NEXT: v_alignbit_b32 v32, v7, v32, 16 ; SI-NEXT: v_alignbit_b32 v31, v9, v31, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v30, 16 -; SI-NEXT: v_alignbit_b32 v29, v13, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v17, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v11, v29, 16 +; SI-NEXT: v_alignbit_b32 v27, v13, v27, 16 +; SI-NEXT: v_alignbit_b32 v24, v15, v24, 16 +; SI-NEXT: v_alignbit_b32 v22, v17, v22, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v7, v7, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_or_b32_e32 v12, v12, v20 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: v_or_b32_e32 v4, v4, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v23 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29057,328 +27462,289 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s8, s26, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s20, 16 -; SI-NEXT: s_lshr_b32 s12, s18, 16 -; SI-NEXT: s_lshr_b32 s13, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_or_b32_e32 v11, v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_or_b32_e32 v13, v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s27 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v44, v18, v0 -; SI-NEXT: v_or_b32_e32 v42, v19, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v15, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_or_b32_e32 v7, v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v23, v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s18 +; SI-NEXT: v_or_b32_e32 v3, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v40, v19, v6 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_or_b32_e32 v43, v19, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v45 -; SI-NEXT: v_or_b32_e32 v55, v18, v6 -; SI-NEXT: v_or_b32_e32 v52, v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v19, v4 +; SI-NEXT: v_or_b32_e32 v42, v14, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v45, v19, v10 -; SI-NEXT: v_or_b32_e32 v40, v18, v12 -; SI-NEXT: v_or_b32_e32 v53, v20, v14 -; SI-NEXT: v_or_b32_e32 v50, v21, v16 -; SI-NEXT: v_lshr_b64 v[34:35], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[16:17], 16 -; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v55, v20, v12 +; SI-NEXT: v_or_b32_e32 v53, v21, v10 +; SI-NEXT: v_or_b32_e32 v41, v14, v15 +; SI-NEXT: v_or_b32_e32 v54, v19, v22 +; SI-NEXT: v_lshr_b64 v[34:35], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[2:3], 16 +; SI-NEXT: v_or_b32_e32 v17, v17, v0 +; SI-NEXT: v_or_b32_e32 v18, v18, v2 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v52, s7 +; SI-NEXT: v_mov_b32_e32 v38, s40 +; SI-NEXT: v_mov_b32_e32 v39, s14 +; SI-NEXT: v_mov_b32_e32 v48, s11 +; SI-NEXT: v_mov_b32_e32 v49, s10 +; SI-NEXT: v_mov_b32_e32 v50, s9 +; SI-NEXT: v_mov_b32_e32 v51, s8 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: v_mov_b32_e32 v53, s24 +; SI-NEXT: v_mov_b32_e32 v55, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, s18 +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v34, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v32, s41 +; SI-NEXT: v_mov_b32_e32 v30, s15 +; SI-NEXT: v_mov_b32_e32 v28, s13 +; SI-NEXT: v_mov_b32_e32 v26, s12 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: .LBB59_5: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v49 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index ce06af35bf4f0..9896de3fe8c5e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -5140,327 +5140,156 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40f16: @@ -5901,95 +5730,53 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_readfirstlane_b32 s22, v7 ; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s23, v9 -; SI-NEXT: v_readfirstlane_b32 s22, v10 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s20, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s14, v18 -; SI-NEXT: v_readfirstlane_b32 s13, v19 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v0 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v5 -; SI-NEXT: s_cbranch_scc0 .LBB17_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_and_b64 s[24:25], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -6004,193 +5791,135 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: s_lshr_b32 s27, s22, 16 -; SI-NEXT: s_lshr_b32 s28, s21, 16 -; SI-NEXT: s_lshr_b32 s29, s20, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: s_lshr_b32 s45, s14, 16 -; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s11, 16 -; SI-NEXT: s_lshr_b32 s57, s10, 16 -; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s59, s7, 16 -; SI-NEXT: s_lshr_b32 s60, s6, 16 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: @@ -6745,86 +6474,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -6841,103 +6491,112 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -6950,25 +6609,50 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -6982,6 +6666,13 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -6990,10 +6681,10 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7006,14 +6697,10 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7022,143 +6709,149 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -7714,175 +7407,116 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7895,11 +7529,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7908,239 +7541,142 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v40f16_to_v20i32_scalar: @@ -12986,327 +12522,156 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB32_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40f16: @@ -13715,314 +13080,174 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, s16 -; SI-NEXT: v_mov_b32_e32 v58, s17 -; SI-NEXT: v_mov_b32_e32 v57, s18 -; SI-NEXT: v_mov_b32_e32 v56, s19 -; SI-NEXT: v_mov_b32_e32 v47, s20 -; SI-NEXT: v_mov_b32_e32 v46, s21 -; SI-NEXT: v_mov_b32_e32 v45, s22 -; SI-NEXT: v_mov_b32_e32 v44, s23 -; SI-NEXT: v_mov_b32_e32 v43, s24 -; SI-NEXT: v_mov_b32_e32 v42, s25 -; SI-NEXT: v_mov_b32_e32 v40, s26 -; SI-NEXT: v_mov_b32_e32 v55, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v41, s29 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v6, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v47 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v40 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v54 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v41 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v5, v35, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: @@ -14619,86 +13844,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -14715,103 +13861,112 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -14824,25 +13979,50 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -14856,6 +14036,13 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -14864,10 +14051,10 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -14880,14 +14067,10 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -14896,143 +14079,149 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -15588,175 +14777,116 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-LABEL: bitcast_v40f16_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -15769,11 +14899,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -15782,239 +14911,142 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v40f16_to_v20f32_scalar: @@ -20162,146 +19194,50 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -20326,163 +19262,88 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v31, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40f16: @@ -20933,296 +19794,196 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 ; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_readfirstlane_b32 s24, v7 +; SI-NEXT: v_readfirstlane_b32 s22, v7 ; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s22, v9 -; SI-NEXT: v_readfirstlane_b32 s23, v10 -; SI-NEXT: v_readfirstlane_b32 s20, v11 -; SI-NEXT: v_readfirstlane_b32 s21, v12 -; SI-NEXT: v_readfirstlane_b32 s18, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_readfirstlane_b32 s14, v17 -; SI-NEXT: v_readfirstlane_b32 s15, v18 -; SI-NEXT: v_readfirstlane_b32 s12, v19 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v0 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s7, v2 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s23, v8 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s21, v10 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: v_readfirstlane_b32 s19, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_and_b64 s[24:25], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s24 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s25, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s25, s5, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s26, s22, 16 -; SI-NEXT: s_lshr_b32 s27, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s28, s20, 16 -; SI-NEXT: s_lshr_b32 s29, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s42, s16, 16 -; SI-NEXT: s_lshr_b32 s43, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s46, s12, 16 -; SI-NEXT: s_lshr_b32 s47, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s58, s7, 16 -; SI-NEXT: s_lshr_b32 s59, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s60, s6, 16 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s76, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 s78, s17, 16 +; SI-NEXT: s_lshr_b32 s79, s19, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s23, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; SI-NEXT: v_or_b32_e32 v5, v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; SI-NEXT: v_or_b32_e32 v9, v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v11, v28, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v26, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_or_b32_e32 v15, v24, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v22 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v8, v31, v8 -; SI-NEXT: v_or_b32_e32 v10, v29, v10 -; SI-NEXT: v_or_b32_e32 v12, v27, v12 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v16, v23, v16 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: s_lshl_b32 s25, s60, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s25 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s25, s89, 16 +; SI-NEXT: s_or_b32 s23, s23, s25 +; SI-NEXT: s_lshl_b32 s25, s58, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s25 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s25, s88, 16 +; SI-NEXT: s_or_b32 s21, s21, s25 +; SI-NEXT: s_lshl_b32 s25, s56, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s25 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s25, s79, 16 +; SI-NEXT: s_or_b32 s19, s19, s25 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s25, s46, 16 +; SI-NEXT: s_or_b32 s16, s16, s25 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s25, s78, 16 +; SI-NEXT: s_or_b32 s17, s17, s25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s25, s44, 16 +; SI-NEXT: s_or_b32 s14, s14, s25 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s25, s77, 16 +; SI-NEXT: s_or_b32 s15, s15, s25 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s25, s42, 16 +; SI-NEXT: s_or_b32 s12, s12, s25 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s13, s13, s25 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s25, s40, 16 +; SI-NEXT: s_or_b32 s10, s10, s25 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s25, s75, 16 +; SI-NEXT: s_or_b32 s11, s11, s25 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s25, s28, 16 +; SI-NEXT: s_or_b32 s8, s8, s25 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s25, s74, 16 +; SI-NEXT: s_or_b32 s9, s9, s25 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s25, s26, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s25, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s7, s7, s25 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_mov_b32_e32 v3, s21 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: @@ -21777,86 +20538,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -21873,103 +20555,112 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -21982,25 +20673,50 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -22014,6 +20730,13 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -22022,10 +20745,10 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22038,14 +20761,10 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22054,143 +20773,149 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -22746,175 +21471,116 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22927,11 +21593,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -22940,239 +21605,142 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v40f16_to_v10i64_scalar: @@ -26548,136 +25116,50 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -26687,168 +25169,93 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: .LBB52_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v25 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40f16: @@ -27237,309 +25644,164 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[4:5], 16 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[2:3], 16 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v24, v20, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v7, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v18, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v7, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v7, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v49 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v33 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v27 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v3, v21 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: @@ -28116,86 +26378,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v32, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -28212,103 +26395,112 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v43, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_mov_b32_e32 v34, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v43 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_or_b32_e32 v8, v46, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v42, v10 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_or_b32_e32 v12, v54, v12 -; SI-NEXT: v_or_b32_e32 v13, v52, v13 -; SI-NEXT: v_or_b32_e32 v14, v50, v14 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -28321,25 +26513,50 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -28353,6 +26570,13 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 @@ -28361,10 +26585,10 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28377,14 +26601,10 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28393,143 +26613,149 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 @@ -29085,175 +27311,116 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-LABEL: bitcast_v40f16_to_v10f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s14 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 -; SI-NEXT: s_lshr_b32 s14, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: s_lshr_b32 s14, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s27 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v34, v5 -; SI-NEXT: v_or_b32_e32 v6, v33, v6 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_or_b32_e32 v12, v50, v12 -; SI-NEXT: v_or_b32_e32 v13, v48, v13 -; SI-NEXT: v_or_b32_e32 v14, v30, v14 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v16, v26, v16 -; SI-NEXT: v_or_b32_e32 v17, v24, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -29266,11 +27433,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -29279,239 +27445,142 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v44 -; SI-NEXT: v_mov_b32_e32 v44, v48 -; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v38, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v37, v46 -; SI-NEXT: v_mov_b32_e32 v46, v50 -; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v36, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: v_mov_b32_e32 v56, v52 -; SI-NEXT: v_mov_b32_e32 v52, v24 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v34, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v33, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v59 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v55, v27 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v43, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v27, v55 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v32 -; SI-NEXT: v_mov_b32_e32 v26, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v33 -; SI-NEXT: v_mov_b32_e32 v25, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v34 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v52, v56 -; SI-NEXT: v_mov_b32_e32 v56, v35 -; SI-NEXT: v_mov_b32_e32 v23, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v22, v50 -; SI-NEXT: v_mov_b32_e32 v50, v46 -; SI-NEXT: v_mov_b32_e32 v46, v37 -; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v45, v38 -; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v44 -; SI-NEXT: v_mov_b32_e32 v44, v39 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v29, v41 -; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v40f16_to_v10f64_scalar: @@ -30018,17 +28087,57 @@ end: define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30045,132 +28154,141 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v0 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v25 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v53, v1, v63 +; SI-NEXT: v_alignbit_b32 v1, v53, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v51, v1, v40 +; SI-NEXT: v_alignbit_b32 v1, v51, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v49, v1, v42 +; SI-NEXT: v_alignbit_b32 v1, v49, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v38, v1, v45 +; SI-NEXT: v_alignbit_b32 v1, v38, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v36, v1, v46 +; SI-NEXT: v_alignbit_b32 v1, v36, v41, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v59, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v1, v57 +; SI-NEXT: v_or_b32_e32 v56, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v33, v44, 16 +; SI-NEXT: v_or_b32_e32 v43, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v55, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v30, v1, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_alignbit_b32 v1, v30, v47, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v27, v1, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v1, v27, v58, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v25, v1, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_alignbit_b32 v1, v25, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_or_b32_e32 v20, v1, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_alignbit_b32 v1, v20, v39, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -30178,281 +28296,214 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v39, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v52, v18 +; SI-NEXT: v_or_b32_e32 v16, v32, v16 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: v_or_b32_e32 v14, v58, v14 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v10, v44, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v54, v6 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v42, v4 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v0, v53, v59, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v45, v6 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_alignbit_b32 v0, v51, v56, 16 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v0, v49, v43, 16 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_alignbit_b32 v0, v38, v55, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v36, v10, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v30, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v27, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v25, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v42 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v49 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30469,13 +28520,44 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v53 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v40f16: @@ -30874,286 +28956,403 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-LABEL: bitcast_v40i16_to_v40f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s54, s28, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s53, s26, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s52, s24, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s51, s22, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s50, s20, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s49, s18, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s48, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_readfirstlane_b32 s69, v5 +; SI-NEXT: v_readfirstlane_b32 s70, v4 +; SI-NEXT: v_readfirstlane_b32 s65, v3 +; SI-NEXT: v_readfirstlane_b32 s67, v2 +; SI-NEXT: v_readfirstlane_b32 s55, v1 +; SI-NEXT: v_readfirstlane_b32 s64, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v8 +; SI-NEXT: v_readfirstlane_b32 s38, v9 +; SI-NEXT: v_readfirstlane_b32 s68, v10 +; SI-NEXT: v_readfirstlane_b32 s37, v11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s66, v12 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v42 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s94, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s95, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: s_and_b32 s5, s55, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s65, 0xffff +; SI-NEXT: s_lshl_b32 s45, s38, 16 +; SI-NEXT: s_and_b32 s44, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s5, s5, s45 +; SI-NEXT: s_and_b32 s45, s69, 0xffff +; SI-NEXT: s_lshl_b32 s56, s39, 16 +; SI-NEXT: s_lshl_b32 s42, s49, 16 +; SI-NEXT: s_or_b32 s91, s45, s56 +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_lshr_b64 s[56:57], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s50, 16 +; SI-NEXT: s_or_b32 s46, s46, s42 +; SI-NEXT: s_lshr_b64 s[58:59], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s20, 0xffff +; SI-NEXT: s_lshl_b32 s14, s51, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s52, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s10, s53, 16 +; SI-NEXT: s_or_b32 s14, s14, s12 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_lshl_b32 s8, s54, 16 +; SI-NEXT: s_or_b32 s12, s12, s10 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s6, s66, 16 +; SI-NEXT: s_or_b32 s10, s10, s8 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s64, 0xffff +; SI-NEXT: s_lshl_b32 s4, s68, 16 +; SI-NEXT: s_or_b32 s8, s8, s6 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 +; SI-NEXT: s_and_b32 s6, s67, 0xffff +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_mov_b32 s45, s47 +; SI-NEXT: s_mov_b32 s47, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_mov_b32 s15, s13 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s7, s5 +; SI-NEXT: s_or_b32 s4, s4, s90 +; SI-NEXT: s_mov_b32 s5, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_add_i32 s69, s69, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s69, 0xffff +; SI-NEXT: s_lshl_b32 s6, s39, 16 +; SI-NEXT: s_add_i32 s67, s67, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s67, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_add_i32 s65, s65, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s65, 0xffff +; SI-NEXT: s_lshl_b32 s8, s38, 16 +; SI-NEXT: s_add_i32 s64, s64, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s64, 0xffff +; SI-NEXT: s_lshl_b32 s9, s66, 16 +; SI-NEXT: s_add_i32 s55, s55, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s55, 0xffff +; SI-NEXT: s_lshl_b32 s10, s37, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s54, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s29, 0xffff +; SI-NEXT: s_lshl_b32 s12, s36, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s26, 0xffff +; SI-NEXT: s_lshl_b32 s13, s53, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: s_lshl_b32 s14, s35, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s15, s52, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s24, s34, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s15, s24, s15 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s51, 16 +; SI-NEXT: s_or_b32 s22, s24, s22 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s40, s22, 0x30000 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s31, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s41, s22, 0x30000 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s50, 16 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s42, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s30, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s43, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s49, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s46, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s95, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: s_add_i32 s47, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s48, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s94, s45, 16 +; SI-NEXT: s_lshr_b32 s95, s47, 16 +; SI-NEXT: s_lshr_b32 s30, s43, 16 +; SI-NEXT: s_lshr_b32 s31, s41, 16 +; SI-NEXT: s_lshr_b32 s34, s15, 16 +; SI-NEXT: s_lshr_b32 s35, s13, 16 +; SI-NEXT: s_lshr_b32 s36, s11, 16 +; SI-NEXT: s_lshr_b32 s37, s9, 16 +; SI-NEXT: s_lshr_b32 s38, s7, 16 +; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v51 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v35 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s46, 0xffff +; SI-NEXT: s_lshl_b32 s19, s58, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s47, 0xffff +; SI-NEXT: s_lshl_b32 s20, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s42, 0xffff +; SI-NEXT: s_lshl_b32 s21, s60, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s43, 0xffff +; SI-NEXT: s_lshl_b32 s22, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s40, 0xffff +; SI-NEXT: s_lshl_b32 s23, s62, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s41, 0xffff +; SI-NEXT: s_lshl_b32 s24, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s24, s72, 16 +; SI-NEXT: s_or_b32 s14, s14, s24 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s24, s34, 16 +; SI-NEXT: s_or_b32 s15, s15, s24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s24, s74, 16 +; SI-NEXT: s_or_b32 s12, s12, s24 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s24, s35, 16 +; SI-NEXT: s_or_b32 s13, s13, s24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s24, s76, 16 +; SI-NEXT: s_or_b32 s10, s10, s24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s24, s36, 16 +; SI-NEXT: s_or_b32 s11, s11, s24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s24, s78, 16 +; SI-NEXT: s_or_b32 s8, s8, s24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s24, s37, 16 +; SI-NEXT: s_or_b32 s9, s9, s24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s24, s88, 16 +; SI-NEXT: s_or_b32 s6, s6, s24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s24, s38, 16 +; SI-NEXT: s_or_b32 s7, s7, s24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s24, s90, 16 +; SI-NEXT: s_or_b32 s4, s4, s24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s24, s39, 16 +; SI-NEXT: s_or_b32 s5, s5, s24 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_mov_b32_e32 v12, s10 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v17, s7 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v19, s5 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: @@ -31803,344 +30002,264 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v40f16_to_v40i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v25, v25, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v28, v28, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v38 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 ; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v27, v27, v49 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v33 -; SI-NEXT: v_or_b32_e32 v24, v24, v50 -; SI-NEXT: v_or_b32_e32 v16, v16, v31 -; SI-NEXT: v_or_b32_e32 v21, v21, v51 -; SI-NEXT: v_alignbit_b32 v48, v2, v20, 16 -; SI-NEXT: v_alignbit_b32 v39, v30, v39, 16 -; SI-NEXT: v_alignbit_b32 v38, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v28, v36, 16 -; SI-NEXT: v_alignbit_b32 v36, v26, v49, 16 -; SI-NEXT: v_alignbit_b32 v35, v12, v34, 16 -; SI-NEXT: v_alignbit_b32 v34, v25, v33, 16 -; SI-NEXT: v_alignbit_b32 v33, v23, v50, 16 -; SI-NEXT: v_alignbit_b32 v32, v18, v31, 16 -; SI-NEXT: v_alignbit_b32 v31, v22, v51, 16 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_alignbit_b32 v39, v1, v39, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v38, 16 +; SI-NEXT: v_alignbit_b32 v37, v5, v37, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v36, 16 +; SI-NEXT: v_alignbit_b32 v35, v9, v35, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v32, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v29, 16 +; SI-NEXT: v_alignbit_b32 v26, v17, v26, 16 +; SI-NEXT: v_alignbit_b32 v24, v19, v24, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 ; SI-NEXT: v_or_b32_e32 v4, v4, v20 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: v_or_b32_e32 v11, v11, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 ; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_or_b32_e32 v15, v15, v20 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 ; SI-NEXT: v_or_b32_e32 v16, v16, v20 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38 ; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v40i16: @@ -32540,374 +30659,326 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 ; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s18, 16 -; SI-NEXT: s_lshr_b32 s15, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v13 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v18 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s13, s24, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: v_or_b32_e32 v53, v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_or_b32_e32 v13, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v54, v16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_or_b32_e32 v44, v21, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v43, v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_or_b32_e32 v11, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_or_b32_e32 v28, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v42, v16, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v5, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v21, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v47, v15, v6 +; SI-NEXT: v_or_b32_e32 v45, v16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v58, v16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 -; SI-NEXT: v_or_b32_e32 v46, v21, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v55 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v24 -; SI-NEXT: v_or_b32_e32 v41, v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v55, v21, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v59, v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v42, v20, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v20 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v13, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v20 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v20 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 -; SI-NEXT: v_or_b32_e32 v3, v3, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v21 -; SI-NEXT: v_or_b32_e32 v1, v1, v20 -; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[18:19], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v44, v21, v17 +; SI-NEXT: v_or_b32_e32 v41, v22, v25 +; SI-NEXT: v_or_b32_e32 v43, v15, v27 +; SI-NEXT: v_or_b32_e32 v19, v16, v2 +; SI-NEXT: v_lshr_b64 v[21:22], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[4:5], 16 +; SI-NEXT: v_or_b32_e32 v14, v14, v0 +; SI-NEXT: v_or_b32_e32 v20, v20, v4 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v40, s7 +; SI-NEXT: v_mov_b32_e32 v55, s8 +; SI-NEXT: v_mov_b32_e32 v50, s40 +; SI-NEXT: v_mov_b32_e32 v51, s15 +; SI-NEXT: v_mov_b32_e32 v52, s11 +; SI-NEXT: v_mov_b32_e32 v53, s10 +; SI-NEXT: v_mov_b32_e32 v54, s9 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v43, s28 +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v44, s24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v45, s22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, s20 +; SI-NEXT: v_mov_b32_e32 v42, s18 +; SI-NEXT: v_mov_b32_e32 v46, s16 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v26, s27 +; SI-NEXT: v_mov_b32_e32 v28, s29 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v37, s42 +; SI-NEXT: v_mov_b32_e32 v22, s41 +; SI-NEXT: v_mov_b32_e32 v35, s14 +; SI-NEXT: v_mov_b32_e32 v33, s13 +; SI-NEXT: v_mov_b32_e32 v31, s12 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_or_b32_e32 v21, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v42 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v3, v23 +; SI-NEXT: v_mov_b32_e32 v5, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 8a0d00ea6164f..b074de310729d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1387,44 +1387,28 @@ define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4f16: @@ -1494,37 +1478,27 @@ define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s16, 3 -; SI-NEXT: s_addc_u32 s5, s17, 0 -; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_i64_to_v4f16_scalar: @@ -1600,16 +1574,10 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1622,21 +1590,23 @@ define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB18_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1721,46 +1691,43 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; SI-LABEL: bitcast_v4f16_to_i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB19_4 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB19_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_i64_scalar: ; VI: ; %bb.0: @@ -4644,42 +4611,27 @@ define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16: @@ -4744,39 +4696,33 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16_scalar: ; VI: ; %bb.0: @@ -4850,16 +4796,10 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4872,21 +4812,23 @@ define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4971,46 +4913,43 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; SI-LABEL: bitcast_v4f16_to_f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_f64_scalar: ; VI: ; %bb.0: @@ -7587,44 +7526,28 @@ define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB60_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4f16: @@ -7693,37 +7616,27 @@ define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s16, 3 -; SI-NEXT: s_add_i32 s6, s17, 3 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: @@ -7799,16 +7712,10 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7821,21 +7728,23 @@ define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB62_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -7920,46 +7829,43 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: s_cbranch_scc0 .LBB63_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB63_4 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB63_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: .LBB63_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; VI: ; %bb.0: @@ -10208,44 +10114,28 @@ define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16: @@ -10311,40 +10201,34 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: s_cbranch_scc0 .LBB77_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: .LBB77_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: ; VI: ; %bb.0: @@ -10421,16 +10305,10 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10443,21 +10321,23 @@ define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB78_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10542,46 +10422,43 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; SI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: s_cbranch_scc0 .LBB79_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB79_4 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB79_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: .LBB79_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; VI: ; %bb.0: @@ -12439,47 +12316,48 @@ define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v4i16_to_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v1, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v4, v6, 16 +; SI-NEXT: v_or_b32_e32 v2, v0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4f16: @@ -12550,40 +12428,46 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v4i16_to_v4f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s12, s11, 16 +; SI-NEXT: s_or_b32 s13, s5, s6 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s10, s5, 16 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v4i16_to_v4f16_scalar: @@ -12670,14 +12554,6 @@ define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12780,39 +12656,39 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s7, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_cbranch_scc0 .LBB91_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: s_cbranch_execnz .LBB91_4 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v0, v5, v0 -; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_branch .LBB91_5 +; SI-NEXT: .LBB91_3: +; SI-NEXT: s_branch .LBB91_2 +; SI-NEXT: .LBB91_4: +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -12820,8 +12696,6 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; VI: ; %bb.0: @@ -14769,62 +14643,54 @@ define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v4bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB100_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: .LBB100_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4bf16: @@ -14896,57 +14762,55 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v4 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: s_cbranch_scc0 .LBB101_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: s_lshl_b32 s8, s16, 16 +; SI-NEXT: s_lshl_b32 s9, s6, 16 +; SI-NEXT: s_lshl_b32 s10, s17, 16 +; SI-NEXT: s_lshl_b32 s11, s7, 16 +; SI-NEXT: s_cbranch_execnz .LBB101_4 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_branch .LBB101_5 +; SI-NEXT: .LBB101_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: .LBB101_4: +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB101_2 ; ; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; VI: ; %bb.0: @@ -15035,63 +14899,56 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_alignbit_b32 v2, v1, v5, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v3, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_alignbit_b32 v0, v0, v7, 16 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v3, v2, v3, 16 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4f16: @@ -15319,53 +15176,46 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s7, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s7 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[7:8], 16 ; SI-NEXT: s_cbranch_execnz .LBB103_3 ; SI-NEXT: .LBB103_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 ; SI-NEXT: .LBB103_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB103_4: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: @@ -15617,16 +15467,10 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v4f16_to_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -15645,10 +15489,12 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB104_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -15862,64 +15708,68 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s5, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b32 s15, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: s_cbranch_scc0 .LBB105_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_or_b32_e32 v9, v8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_bfe_u32 s9, s14, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB105_4 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v9, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v9, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_or_b32_e32 v10, v2, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 ; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_branch .LBB105_5 +; SI-NEXT: .LBB105_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: .LBB105_4: +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: .LBB105_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v9 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v10 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; VI: ; %bb.0: @@ -16083,72 +15933,78 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v7, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v5, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4f16: @@ -16440,60 +16296,68 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s10, s6, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s11, s5, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], 16 +; SI-NEXT: s_or_b32 s4, s4, s10 +; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_cbranch_execnz .LBB107_3 ; SI-NEXT: .LBB107_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s7, s19, 8 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s17, 8 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: .LBB107_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB107_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB107_2 ; ; VI-LABEL: bitcast_v8i8_to_v4f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 6e2167edd97cd..d3fd1ab06c1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -5540,393 +5540,192 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v22i32_to_v44f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 @@ -6381,103 +6180,57 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v9, s27 ; SI-NEXT: v_readfirstlane_b32 s25, v10 ; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v11 +; SI-NEXT: v_readfirstlane_b32 s22, v11 ; SI-NEXT: v_mov_b32_e32 v11, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s23, v13 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v15 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s19, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s17, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v9 -; SI-NEXT: v_readfirstlane_b32 s15, v10 -; SI-NEXT: v_readfirstlane_b32 s14, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v0 -; SI-NEXT: v_readfirstlane_b32 s12, v1 -; SI-NEXT: v_readfirstlane_b32 s11, v2 -; SI-NEXT: v_readfirstlane_b32 s10, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 ; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: v_readfirstlane_b32 s6, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -6492,217 +6245,153 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s28, s26, 16 -; SI-NEXT: s_lshr_b32 s29, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s23, 16 -; SI-NEXT: s_lshr_b32 s41, s22, 16 -; SI-NEXT: s_lshr_b32 s42, s21, 16 -; SI-NEXT: s_lshr_b32 s43, s20, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 s45, s18, 16 -; SI-NEXT: s_lshr_b32 s46, s17, 16 -; SI-NEXT: s_lshr_b32 s47, s16, 16 -; SI-NEXT: s_lshr_b32 s56, s15, 16 -; SI-NEXT: s_lshr_b32 s57, s14, 16 -; SI-NEXT: s_lshr_b32 s58, s13, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s11, 16 -; SI-NEXT: s_lshr_b32 s61, s10, 16 -; SI-NEXT: s_lshr_b32 s62, s8, 16 -; SI-NEXT: s_lshr_b32 s63, s7, 16 -; SI-NEXT: s_lshr_b32 s72, s6, 16 -; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v14, v29, v14 -; SI-NEXT: v_or_b32_e32 v16, v27, v16 -; SI-NEXT: v_or_b32_e32 v18, v25, v18 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: s_branch .LBB17_2 -; -; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, s16 ; VI-NEXT: v_mov_b32_e32 v10, s17 ; VI-NEXT: v_mov_b32_e32 v11, s18 ; VI-NEXT: v_mov_b32_e32 v12, s19 @@ -7296,57 +6985,6 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7363,142 +7001,137 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -7508,45 +7141,58 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -7576,10 +7222,10 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7592,10 +7238,10 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -7604,168 +7250,170 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -8384,196 +8032,126 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8586,167 +8164,160 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -8754,111 +8325,9 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: @@ -14098,371 +13567,170 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44f16: @@ -14903,361 +14171,190 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s18 -; SI-NEXT: v_mov_b32_e32 v62, s19 -; SI-NEXT: v_mov_b32_e32 v60, s20 -; SI-NEXT: v_mov_b32_e32 v59, s21 -; SI-NEXT: v_mov_b32_e32 v58, s22 -; SI-NEXT: v_mov_b32_e32 v47, s23 -; SI-NEXT: v_mov_b32_e32 v44, s24 -; SI-NEXT: v_mov_b32_e32 v43, s25 -; SI-NEXT: v_mov_b32_e32 v42, s26 -; SI-NEXT: v_mov_b32_e32 v56, s27 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v57, s28 -; SI-NEXT: v_mov_b32_e32 v45, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v21 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v59 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v43 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v45 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[35:36], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v20, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v20, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v31 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v20, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v20, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v20, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: @@ -15906,57 +15003,6 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -15973,142 +15019,137 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -16118,45 +15159,58 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -16186,10 +15240,10 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -16202,10 +15256,10 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -16214,168 +15268,170 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -16994,196 +16050,126 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-LABEL: bitcast_v44f16_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -17196,167 +16182,160 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -17364,111 +16343,9 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: @@ -21934,198 +20811,87 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %end -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB43_2 - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <44 x i16> %a, splat (i16 3) - %a2 = bitcast <44 x i16> %a1 to <11 x i64> - br label %end - -cmp.false: - %a3 = bitcast <44 x i16> %a to <11 x i64> - br label %end - -end: - %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x i64> %phi -} - -define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { -; SI-LABEL: bitcast_v11i64_to_v44f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -22150,188 +20916,98 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44f16: @@ -22818,324 +21494,214 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_readfirstlane_b32 s24, v9 ; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_readfirstlane_b32 s26, v10 +; SI-NEXT: v_readfirstlane_b32 s25, v10 ; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_readfirstlane_b32 s25, v11 +; SI-NEXT: v_readfirstlane_b32 s22, v11 ; SI-NEXT: v_mov_b32_e32 v11, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_readfirstlane_b32 s27, v12 -; SI-NEXT: v_readfirstlane_b32 s22, v13 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s20, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s18, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v19 -; SI-NEXT: v_readfirstlane_b32 s17, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v0 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v4 -; SI-NEXT: v_readfirstlane_b32 s8, v5 -; SI-NEXT: v_readfirstlane_b32 s6, v6 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s23, v12 +; SI-NEXT: v_readfirstlane_b32 s20, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v15 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s17, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v0 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s24 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s26, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s28, s25, 16 -; SI-NEXT: s_lshr_b32 s29, s27, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s40, s22, 16 -; SI-NEXT: s_lshr_b32 s41, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s42, s20, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s44, s18, 16 -; SI-NEXT: s_lshr_b32 s45, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s46, s16, 16 -; SI-NEXT: s_lshr_b32 s47, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s58, s12, 16 -; SI-NEXT: s_lshr_b32 s59, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s60, s10, 16 -; SI-NEXT: s_lshr_b32 s61, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s62, s7, 16 -; SI-NEXT: s_lshr_b32 s63, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s72, s6, 16 -; SI-NEXT: s_lshr_b32 s73, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s13, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_lshr_b32 s90, s17, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s23, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v38 -; SI-NEXT: v_or_b32_e32 v7, v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v30, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_or_b32_e32 v17, v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v19, v24, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_or_b32_e32 v4, v39, v4 -; SI-NEXT: v_or_b32_e32 v6, v37, v6 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_or_b32_e32 v14, v29, v14 -; SI-NEXT: v_or_b32_e32 v16, v27, v16 -; SI-NEXT: v_or_b32_e32 v18, v25, v18 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_lshl_b32 s27, s72, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s27 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s27, s94, 16 +; SI-NEXT: s_or_b32 s25, s25, s27 +; SI-NEXT: s_lshl_b32 s27, s62, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s27 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s27, s93, 16 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_lshl_b32 s27, s60, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s27 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_or_b32 s21, s21, s27 +; SI-NEXT: s_lshl_b32 s27, s58, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s27 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s27, s91, 16 +; SI-NEXT: s_or_b32 s19, s19, s27 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s27, s56, 16 +; SI-NEXT: s_or_b32 s16, s16, s27 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s27, s90, 16 +; SI-NEXT: s_or_b32 s17, s17, s27 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s27, s46, 16 +; SI-NEXT: s_or_b32 s14, s14, s27 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s27, s89, 16 +; SI-NEXT: s_or_b32 s15, s15, s27 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s27, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s27 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s13, s13, s27 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s27, s42, 16 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s27, s79, 16 +; SI-NEXT: s_or_b32 s11, s11, s27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s27, s40, 16 +; SI-NEXT: s_or_b32 s8, s8, s27 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s27, s78, 16 +; SI-NEXT: s_or_b32 s9, s9, s27 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s27, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s27, s77, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s76, 16 +; SI-NEXT: s_or_b32 s7, s7, s27 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v18, s6 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: v_mov_b32_e32 v21, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: @@ -23735,57 +22301,6 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23802,142 +22317,137 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -23947,45 +22457,58 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -24015,10 +22538,10 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -24031,10 +22554,10 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -24043,168 +22566,170 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -24823,196 +23348,126 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -25025,167 +23480,160 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -25193,111 +23641,9 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: @@ -28984,154 +27330,54 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB52_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 @@ -29145,188 +27391,98 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v23, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v25, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v29, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v28 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44f16: @@ -29745,354 +27901,185 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_mov_b32_e32 v21, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[4:5], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[2:3], 16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[0:1], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v28, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_or_b32_e32 v29, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v22, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v23, v16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v53 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v24 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: @@ -30710,65 +28697,14 @@ cmp.false: br label %end end: - %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <44 x half> %phi -} - -define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { -; SI-LABEL: bitcast_v44f16_to_v11f64: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + +define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v44f16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30785,142 +28721,137 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_mov_b32_e32 v45, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v34, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_mov_b32_e32 v51, v10 +; SI-NEXT: v_mov_b32_e32 v52, v9 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v55, v6 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v42, v3 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v45 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v53 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v5, v32, v5 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 -; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_or_b32_e32 v9, v56, v9 -; SI-NEXT: v_or_b32_e32 v10, v46, v10 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v40, v13 -; SI-NEXT: v_or_b32_e32 v14, v54, v14 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v49 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -30930,45 +28861,58 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -30998,10 +28942,10 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -31014,10 +28958,10 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -31026,168 +28970,170 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -31806,369 +29752,292 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-LABEL: bitcast_v44f16_to_v11f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_lshr_b32 s40, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s17 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s23 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v36, v3 +; SI-NEXT: v_mov_b32_e32 v37, v2 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v39 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_or_b32_e32 v2, v21, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v50, v5 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v7, v38, v7 -; SI-NEXT: v_or_b32_e32 v8, v37, v8 -; SI-NEXT: v_or_b32_e32 v9, v34, v9 -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_or_b32_e32 v18, v28, v18 -; SI-NEXT: v_or_b32_e32 v19, v26, v19 -; SI-NEXT: v_or_b32_e32 v20, v24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 @@ -32176,111 +30045,9 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v50, v32 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v53 -; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v39, v56 -; SI-NEXT: v_mov_b32_e32 v56, v54 -; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v38, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v40 -; SI-NEXT: v_mov_b32_e32 v40, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v41 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v44 -; SI-NEXT: v_mov_b32_e32 v44, v30 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v45 -; SI-NEXT: v_mov_b32_e32 v45, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v45 -; SI-NEXT: v_mov_b32_e32 v45, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: v_mov_b32_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_mov_b32_e32 v43, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v27, v41 -; SI-NEXT: v_mov_b32_e32 v41, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v26, v40 -; SI-NEXT: v_mov_b32_e32 v40, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v25, v55 -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v38 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v54, v56 -; SI-NEXT: v_mov_b32_e32 v56, v39 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v33, v51 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: @@ -32827,7 +30594,14 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v44f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -32867,34 +30641,26 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -32913,441 +30679,403 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v50 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v28 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v42, v1, v22 +; SI-NEXT: v_alignbit_b32 v1, v42, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v40, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v40, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v54, v1, v62 +; SI-NEXT: v_alignbit_b32 v1, v54, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v52, v1, v32 +; SI-NEXT: v_alignbit_b32 v1, v52, v59, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_or_b32_e32 v49, v1, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v1, v49, v61, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v38, v1, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_alignbit_b32 v1, v38, v43, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_or_b32_e32 v35, v1, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v33, v1, v45 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_alignbit_b32 v1, v33, v50, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v30, v1, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_alignbit_b32 v1, v30, v55, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v28, v1, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_alignbit_b32 v1, v28, v58, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v22, v1, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_alignbit_b32 v1, v22, v44, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v44, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v20, v60, v20 +; SI-NEXT: v_or_b32_e32 v18, v58, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v16, v55, v16 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v14, v50, v14 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v12, v36, v12 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v10, v43, v10 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v10, v53, v10 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v6, v59, v6 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v2, v47, v2 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v42, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v40, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v54, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v35, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v30, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v39 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v61 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -33364,11 +31092,60 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v44f16: @@ -33799,328 +31576,451 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-LABEL: bitcast_v44i16_to_v44f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s30, 0 +; SI-NEXT: v_writelane_b32 v22, s31, 1 +; SI-NEXT: v_writelane_b32 v22, s34, 2 +; SI-NEXT: v_writelane_b32 v22, s35, 3 +; SI-NEXT: v_writelane_b32 v22, s36, 4 +; SI-NEXT: v_writelane_b32 v22, s37, 5 +; SI-NEXT: v_writelane_b32 v22, s38, 6 +; SI-NEXT: v_writelane_b32 v22, s39, 7 +; SI-NEXT: v_writelane_b32 v22, s48, 8 +; SI-NEXT: v_writelane_b32 v22, s49, 9 +; SI-NEXT: v_writelane_b32 v22, s50, 10 +; SI-NEXT: v_writelane_b32 v22, s51, 11 +; SI-NEXT: v_writelane_b32 v22, s52, 12 +; SI-NEXT: v_writelane_b32 v22, s53, 13 +; SI-NEXT: v_writelane_b32 v22, s54, 14 +; SI-NEXT: v_writelane_b32 v22, s55, 15 +; SI-NEXT: v_writelane_b32 v22, s64, 16 +; SI-NEXT: v_writelane_b32 v22, s65, 17 +; SI-NEXT: v_writelane_b32 v22, s66, 18 +; SI-NEXT: v_writelane_b32 v22, s67, 19 +; SI-NEXT: v_writelane_b32 v22, s68, 20 +; SI-NEXT: v_writelane_b32 v22, s69, 21 +; SI-NEXT: v_writelane_b32 v22, s70, 22 +; SI-NEXT: v_writelane_b32 v22, s71, 23 +; SI-NEXT: v_writelane_b32 v22, s80, 24 +; SI-NEXT: v_writelane_b32 v22, s81, 25 +; SI-NEXT: v_writelane_b32 v22, s82, 26 +; SI-NEXT: v_writelane_b32 v22, s83, 27 +; SI-NEXT: v_writelane_b32 v22, s84, 28 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_lshr_b32 s38, s29, 16 +; SI-NEXT: s_lshr_b32 s65, s28, 16 +; SI-NEXT: s_lshr_b32 s37, s27, 16 +; SI-NEXT: s_lshr_b32 s64, s26, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s55, s24, 16 +; SI-NEXT: s_lshr_b32 s35, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s22, 16 +; SI-NEXT: s_lshr_b32 s34, s21, 16 +; SI-NEXT: s_lshr_b32 s53, s20, 16 +; SI-NEXT: s_lshr_b32 s31, s19, 16 +; SI-NEXT: s_lshr_b32 s52, s18, 16 +; SI-NEXT: s_lshr_b32 s30, s17, 16 +; SI-NEXT: s_lshr_b32 s51, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_writelane_b32 v22, s85, 29 +; SI-NEXT: v_readfirstlane_b32 s82, v7 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s71, v5 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s68, v3 +; SI-NEXT: v_readfirstlane_b32 s70, v2 +; SI-NEXT: v_readfirstlane_b32 s66, v1 +; SI-NEXT: v_readfirstlane_b32 s67, v0 +; SI-NEXT: v_readfirstlane_b32 s50, v9 +; SI-NEXT: v_readfirstlane_b32 s85, v10 +; SI-NEXT: v_readfirstlane_b32 s49, v11 +; SI-NEXT: v_readfirstlane_b32 s83, v12 +; SI-NEXT: v_readfirstlane_b32 s48, v13 +; SI-NEXT: v_readfirstlane_b32 s80, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s39, v15 +; SI-NEXT: v_readfirstlane_b32 s69, v16 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v58 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s58, s51, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: s_or_b32 s12, s4, s58 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s60, s52, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s34, 16 +; SI-NEXT: s_or_b32 s10, s4, s60 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s62, s53, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s8, s4, s62 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s72, s54, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_or_b32 s6, s4, s72 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s56, s55, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s38, 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s26, 0xffff +; SI-NEXT: s_lshl_b32 s44, s65, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s66, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_lshr_b64 s[76:77], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s28, 0xffff +; SI-NEXT: s_lshl_b32 s42, s69, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s68, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s67, 0xffff +; SI-NEXT: s_lshl_b32 s40, s80, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s70, 0xffff +; SI-NEXT: s_lshl_b32 s14, s83, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s82, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s81, 0xffff +; SI-NEXT: s_or_b32 s95, s5, s7 +; SI-NEXT: s_lshl_b32 s94, s85, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s84, 0xffff +; SI-NEXT: s_mov_b32 s13, s59 +; SI-NEXT: s_lshr_b64 s[58:59], s[58:59], 16 +; SI-NEXT: s_mov_b32 s11, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s7, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s5, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_or_b32 s14, s14, s94 +; SI-NEXT: s_mov_b32 s15, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[94:95], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_and_b32 s4, s84, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s50, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s68, s68, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s68, 0xffff +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s67, s67, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s66, s66, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s66, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s37, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s6, s36, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s23, 0xffff +; SI-NEXT: s_lshl_b32 s8, s35, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s53, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s10, s34, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s52, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s12, s31, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s51, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s11, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s5, 16 +; SI-NEXT: s_lshr_b32 s37, s57, 16 +; SI-NEXT: s_lshr_b32 s38, s47, 16 +; SI-NEXT: s_lshr_b32 s39, s45, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b32 s49, s41, 16 +; SI-NEXT: s_lshr_b32 s50, s15, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v31 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v36 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v37 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s35, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s36, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s56, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s57, 0xffff +; SI-NEXT: s_lshl_b32 s18, s37, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s46, 0xffff +; SI-NEXT: s_lshl_b32 s19, s78, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s47, 0xffff +; SI-NEXT: s_lshl_b32 s20, s38, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s44, 0xffff +; SI-NEXT: s_lshl_b32 s21, s88, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s45, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s42, 0xffff +; SI-NEXT: s_lshl_b32 s23, s90, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s43, 0xffff +; SI-NEXT: s_lshl_b32 s24, s48, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s40, 0xffff +; SI-NEXT: s_lshl_b32 s25, s92, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s41, 0xffff +; SI-NEXT: s_lshl_b32 s26, s49, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s26, s94, 16 +; SI-NEXT: s_or_b32 s14, s14, s26 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s26, s50, 16 +; SI-NEXT: s_or_b32 s15, s15, s26 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v21, s15 +; SI-NEXT: v_readlane_b32 s85, v22, 29 +; SI-NEXT: v_readlane_b32 s84, v22, 28 +; SI-NEXT: v_readlane_b32 s83, v22, 27 +; SI-NEXT: v_readlane_b32 s82, v22, 26 +; SI-NEXT: v_readlane_b32 s81, v22, 25 +; SI-NEXT: v_readlane_b32 s80, v22, 24 +; SI-NEXT: v_readlane_b32 s71, v22, 23 +; SI-NEXT: v_readlane_b32 s70, v22, 22 +; SI-NEXT: v_readlane_b32 s69, v22, 21 +; SI-NEXT: v_readlane_b32 s68, v22, 20 +; SI-NEXT: v_readlane_b32 s67, v22, 19 +; SI-NEXT: v_readlane_b32 s66, v22, 18 +; SI-NEXT: v_readlane_b32 s65, v22, 17 +; SI-NEXT: v_readlane_b32 s64, v22, 16 +; SI-NEXT: v_readlane_b32 s55, v22, 15 +; SI-NEXT: v_readlane_b32 s54, v22, 14 +; SI-NEXT: v_readlane_b32 s53, v22, 13 +; SI-NEXT: v_readlane_b32 s52, v22, 12 +; SI-NEXT: v_readlane_b32 s51, v22, 11 +; SI-NEXT: v_readlane_b32 s50, v22, 10 +; SI-NEXT: v_readlane_b32 s49, v22, 9 +; SI-NEXT: v_readlane_b32 s48, v22, 8 +; SI-NEXT: v_readlane_b32 s39, v22, 7 +; SI-NEXT: v_readlane_b32 s38, v22, 6 +; SI-NEXT: v_readlane_b32 s37, v22, 5 +; SI-NEXT: v_readlane_b32 s36, v22, 4 +; SI-NEXT: v_readlane_b32 s35, v22, 3 +; SI-NEXT: v_readlane_b32 s34, v22, 2 +; SI-NEXT: v_readlane_b32 s31, v22, 1 +; SI-NEXT: v_readlane_b32 s30, v22, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: @@ -34826,376 +32726,288 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v52 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_or_b32_e32 v21, v21, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_or_b32_e32 v19, v19, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v17, v17, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v15, v15, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_or_b32_e32 v13, v13, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_or_b32_e32 v9, v9, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v5, v5, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v50 -; SI-NEXT: v_or_b32_e32 v32, v32, v49 -; SI-NEXT: v_or_b32_e32 v31, v31, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v39 -; SI-NEXT: v_or_b32_e32 v28, v28, v38 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v26, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_alignbit_b32 v51, v1, v51, 16 -; SI-NEXT: v_alignbit_b32 v50, v3, v50, 16 -; SI-NEXT: v_alignbit_b32 v49, v5, v49, 16 -; SI-NEXT: v_alignbit_b32 v48, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v39, v9, v39, 16 -; SI-NEXT: v_alignbit_b32 v38, v11, v38, 16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v37 +; SI-NEXT: v_or_b32_e32 v14, v14, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_or_b32_e32 v20, v20, v26 +; SI-NEXT: v_alignbit_b32 v22, v1, v22, 16 +; SI-NEXT: v_alignbit_b32 v51, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v50, 16 +; SI-NEXT: v_alignbit_b32 v49, v7, v49, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v39, 16 ; SI-NEXT: v_alignbit_b32 v37, v13, v37, 16 -; SI-NEXT: v_alignbit_b32 v36, v15, v36, 16 -; SI-NEXT: v_alignbit_b32 v35, v17, v35, 16 -; SI-NEXT: v_alignbit_b32 v34, v19, v34, 16 -; SI-NEXT: v_alignbit_b32 v29, v21, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v15, v35, 16 +; SI-NEXT: v_alignbit_b32 v32, v17, v32, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v29, 16 +; SI-NEXT: v_alignbit_b32 v26, v21, v26, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_or_b32_e32 v8, v8, v22 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v11, v11, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v49 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v32 -; SI-NEXT: v_or_b32_e32 v6, v6, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v30 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v12, v12, v27 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_or_b32_e32 v20, v20, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35603,441 +33415,391 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <44 x half> %a, splat (half 0xH0200) - %a2 = bitcast <44 x half> %a1 to <44 x i16> - br label %end - -cmp.false: - %a3 = bitcast <44 x half> %a to <44 x i16> - br label %end - -end: - %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <44 x i16> %phi -} - -define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s40, s18, 16 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + +define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v20 +; SI-NEXT: s_lshr_b32 s7, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: v_or_b32_e32 v23, v2, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v9, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v31, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v9, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v24 -; SI-NEXT: v_or_b32_e32 v44, v22, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 +; SI-NEXT: v_or_b32_e32 v11, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v56, v4, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_or_b32_e32 v27, v22, v4 -; SI-NEXT: v_or_b32_e32 v56, v24, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v63 -; SI-NEXT: v_or_b32_e32 v26, v23, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v60, v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v58, v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v43, v22, v10 -; SI-NEXT: v_or_b32_e32 v63, v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v60, v23, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_or_b32_e32 v57, v15, v22 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s26 +; SI-NEXT: v_or_b32_e32 v7, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v62, v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_or_b32_e32 v61, v16, v30 +; SI-NEXT: v_or_b32_e32 v59, v14, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v62, v22, v16 -; SI-NEXT: v_or_b32_e32 v58, v24, v18 -; SI-NEXT: v_or_b32_e32 v22, v25, v20 -; SI-NEXT: v_lshr_b64 v[28:29], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 -; SI-NEXT: v_or_b32_e32 v54, v23, v14 -; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[6:7], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[12:13], 16 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_lshr_b64 v[60:61], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v25, v22 -; SI-NEXT: v_lshr_b64 v[22:23], v[20:21], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: v_or_b32_e32 v19, v15, v0 +; SI-NEXT: v_or_b32_e32 v18, v14, v2 +; SI-NEXT: v_or_b32_e32 v25, v16, v4 +; SI-NEXT: v_or_b32_e32 v20, v17, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: v_mov_b32_e32 v46, s7 +; SI-NEXT: v_mov_b32_e32 v42, s13 +; SI-NEXT: v_mov_b32_e32 v43, s12 +; SI-NEXT: v_mov_b32_e32 v44, s11 +; SI-NEXT: v_mov_b32_e32 v45, s10 +; SI-NEXT: v_mov_b32_e32 v47, s9 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, s28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v61, s26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, s24 +; SI-NEXT: v_mov_b32_e32 v57, s22 +; SI-NEXT: v_mov_b32_e32 v58, s20 +; SI-NEXT: v_mov_b32_e32 v60, s18 +; SI-NEXT: v_mov_b32_e32 v56, s16 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v31, s27 +; SI-NEXT: v_mov_b32_e32 v33, s29 +; SI-NEXT: v_mov_b32_e32 v52, s43 +; SI-NEXT: v_mov_b32_e32 v50, s42 +; SI-NEXT: v_mov_b32_e32 v26, s41 +; SI-NEXT: v_mov_b32_e32 v48, s40 +; SI-NEXT: v_mov_b32_e32 v38, s15 +; SI-NEXT: v_mov_b32_e32 v36, s14 +; SI-NEXT: v_mov_b32_e32 v34, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v22, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; SI-NEXT: v_or_b32_e32 v26, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v57 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_or_b32_e32 v23, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v58 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v26 +; SI-NEXT: v_mov_b32_e32 v7, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 1ff6bbd4e9a37..322689c91425b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -5952,433 +5952,186 @@ end: define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v24i32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v48f16: @@ -6874,129 +6627,86 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 ; SI-NEXT: v_mov_b32_e32 v11, s16 ; SI-NEXT: v_mov_b32_e32 v12, s17 ; SI-NEXT: v_mov_b32_e32 v13, s18 ; SI-NEXT: v_mov_b32_e32 v14, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 ; SI-NEXT: v_mov_b32_e32 v16, s21 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v12 +; SI-NEXT: v_readfirstlane_b32 s41, v12 ; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v13 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v14 ; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v15 ; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_readfirstlane_b32 s29, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_readfirstlane_b32 s20, v11 -; SI-NEXT: v_readfirstlane_b32 s19, v12 -; SI-NEXT: v_readfirstlane_b32 s18, v13 -; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_readfirstlane_b32 s16, v15 -; SI-NEXT: v_readfirstlane_b32 s15, v0 -; SI-NEXT: v_readfirstlane_b32 s14, v1 -; SI-NEXT: v_readfirstlane_b32 s13, v2 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_readfirstlane_b32 s11, v4 -; SI-NEXT: v_readfirstlane_b32 s10, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 ; SI-NEXT: v_readfirstlane_b32 s7, v7 -; SI-NEXT: v_readfirstlane_b32 s6, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -7011,232 +6721,167 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_lshr_b32 s5, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s44, s23, 16 -; SI-NEXT: s_lshr_b32 s45, s22, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: s_lshr_b32 s47, s20, 16 -; SI-NEXT: s_lshr_b32 s56, s19, 16 -; SI-NEXT: s_lshr_b32 s57, s18, 16 -; SI-NEXT: s_lshr_b32 s58, s17, 16 -; SI-NEXT: s_lshr_b32 s59, s16, 16 -; SI-NEXT: s_lshr_b32 s60, s15, 16 -; SI-NEXT: s_lshr_b32 s61, s14, 16 -; SI-NEXT: s_lshr_b32 s62, s13, 16 -; SI-NEXT: s_lshr_b32 s63, s12, 16 -; SI-NEXT: s_lshr_b32 s72, s11, 16 -; SI-NEXT: s_lshr_b32 s73, s10, 16 -; SI-NEXT: s_lshr_b32 s74, s8, 16 -; SI-NEXT: s_lshr_b32 s75, s7, 16 -; SI-NEXT: s_lshr_b32 s76, s6, 16 -; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v54, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_or_b32_e32 v12, v35, v12 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v18, v29, v18 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: v_or_b32_e32 v22, v25, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s40, s42, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: @@ -7881,7 +7526,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7898,164 +7542,220 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB18_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -8083,124 +7783,20 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8213,10 +7809,10 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8225,194 +7821,198 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -9085,536 +8685,341 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: s_cbranch_execnz .LBB19_3 -; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v48f16_to_v24i32_scalar: @@ -15279,433 +14684,186 @@ end: define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v24f32_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48f16: @@ -16178,423 +15336,228 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v25, s17 -; SI-NEXT: v_mov_b32_e32 v24, s18 -; SI-NEXT: v_mov_b32_e32 v23, s19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, s20 -; SI-NEXT: v_mov_b32_e32 v60, s21 -; SI-NEXT: v_mov_b32_e32 v58, s22 -; SI-NEXT: v_mov_b32_e32 v57, s23 -; SI-NEXT: v_mov_b32_e32 v56, s24 -; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s27 -; SI-NEXT: v_mov_b32_e32 v61, s28 -; SI-NEXT: v_mov_b32_e32 v59, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v25 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v60 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v58 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v57 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v59 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v40 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v52 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v19, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v48 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: v_or_b32_e32 v11, v20, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: @@ -17293,7 +16256,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -17310,164 +16272,220 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -17495,124 +16513,20 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -17625,10 +16539,10 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -17637,194 +16551,198 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -18497,536 +17415,341 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-LABEL: bitcast_v48f16_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v48f16_to_v24f32_scalar: @@ -23907,190 +22630,60 @@ end: define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v12i64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -24111,228 +22704,112 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v32, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v38, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v49, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48f16: @@ -24840,369 +23317,261 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i ; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 ; SI-NEXT: v_mov_b32_e32 v11, s16 ; SI-NEXT: v_mov_b32_e32 v12, s17 ; SI-NEXT: v_mov_b32_e32 v13, s18 ; SI-NEXT: v_mov_b32_e32 v14, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_writelane_b32 v24, s31, 1 ; SI-NEXT: v_mov_b32_e32 v16, s21 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v11 +; SI-NEXT: v_readfirstlane_b32 s40, v11 ; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_readfirstlane_b32 s41, v12 ; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v13 +; SI-NEXT: v_readfirstlane_b32 s24, v13 ; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v14 ; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v15 ; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_readfirstlane_b32 s28, v16 -; SI-NEXT: v_readfirstlane_b32 s22, v17 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_readfirstlane_b32 s20, v19 -; SI-NEXT: v_readfirstlane_b32 s21, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v13 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s14, v0 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s7, v6 -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: v_readfirstlane_b32 s6, v8 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s23, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v19 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v24, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s24 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s24, 3 -; SI-NEXT: s_addc_u32 s5, s40, 0 -; SI-NEXT: s_lshr_b32 s24, s4, 16 -; SI-NEXT: s_lshr_b32 s29, s5, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s40, s25, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s42, s26, 16 -; SI-NEXT: s_lshr_b32 s43, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s44, s22, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: s_lshr_b32 s47, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s56, s18, 16 -; SI-NEXT: s_lshr_b32 s57, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s58, s16, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s62, s12, 16 -; SI-NEXT: s_lshr_b32 s63, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s72, s10, 16 -; SI-NEXT: s_lshr_b32 s73, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s74, s7, 16 -; SI-NEXT: s_lshr_b32 s75, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s76, s6, 16 -; SI-NEXT: s_lshr_b32 s77, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b32 s95, s19, 16 +; SI-NEXT: s_lshr_b32 s30, s21, 16 +; SI-NEXT: s_lshr_b32 s31, s23, 16 +; SI-NEXT: s_lshr_b32 s34, s25, 16 +; SI-NEXT: s_lshr_b32 s35, s41, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v2, v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v11, v36, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; SI-NEXT: v_or_b32_e32 v3, v54, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v6, v49, v6 -; SI-NEXT: v_or_b32_e32 v8, v39, v8 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_or_b32_e32 v12, v35, v12 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v18, v29, v18 -; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: v_or_b32_e32 v22, v25, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s35, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: s_lshl_b32 s40, s74, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s40 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s40, s34, 16 +; SI-NEXT: s_or_b32 s25, s25, s40 +; SI-NEXT: s_lshl_b32 s40, s72, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s40 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s40, s31, 16 +; SI-NEXT: s_or_b32 s23, s23, s40 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s40, s62, 16 +; SI-NEXT: s_or_b32 s20, s20, s40 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s40, s30, 16 +; SI-NEXT: s_or_b32 s21, s21, s40 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s40, s60, 16 +; SI-NEXT: s_or_b32 s18, s18, s40 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s40, s95, 16 +; SI-NEXT: s_or_b32 s19, s19, s40 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s40, s58, 16 +; SI-NEXT: s_or_b32 s16, s16, s40 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s40, s94, 16 +; SI-NEXT: s_or_b32 s17, s17, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s40, s56, 16 +; SI-NEXT: s_or_b32 s14, s14, s40 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s93, 16 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s40, s46, 16 +; SI-NEXT: s_or_b32 s12, s12, s40 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s40, s92, 16 +; SI-NEXT: s_or_b32 s13, s13, s40 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s40, s44, 16 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s40, s91, 16 +; SI-NEXT: s_or_b32 s11, s11, s40 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s40, s42, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s40 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s40, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s89, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s88, 16 +; SI-NEXT: s_or_b32 s9, s9, s40 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v7, s21 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v12, s14 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_mov_b32_e32 v18, s8 +; SI-NEXT: v_mov_b32_e32 v19, s9 +; SI-NEXT: v_mov_b32_e32 v20, s6 +; SI-NEXT: v_mov_b32_e32 v21, s7 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v23, s5 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: @@ -25847,7 +24216,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -25864,164 +24232,220 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -26049,124 +24473,20 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -26179,10 +24499,10 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -26191,194 +24511,198 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -27051,536 +25375,341 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v48f16_to_v12i64_scalar: @@ -31613,189 +29742,65 @@ end: define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v12f64_to_v48f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 @@ -31803,219 +29808,108 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v24, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v25, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v26, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v28, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v31, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v36, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v39, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v50, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_mov_b32_e32 v33, v22 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v61 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48f16: @@ -32474,404 +30368,206 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v15, s23 ; SI-NEXT: v_mov_b32_e32 v16, s24 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v12, s26 -; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_mov_b32_e32 v33, v8 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v25 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v62 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v58 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v30, v22, v25 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v22, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_or_b32_e32 v32, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v18, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v14, v18 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v28, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_or_b32_e32 v29, v14, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v33 +; SI-NEXT: v_mov_b32_e32 v4, v24 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v39 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v33 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: @@ -33546,7 +31242,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -33563,164 +31258,220 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_mov_b32_e32 v38, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v43, v4 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_mov_b32_e32 v45, v2 +; SI-NEXT: v_mov_b32_e32 v46, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v23 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v41 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -33748,124 +31499,20 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v43 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_or_b32_e32 v2, v50, v2 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_or_b32_e32 v4, v38, v4 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 -; SI-NEXT: v_or_b32_e32 v7, v32, v7 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: v_or_b32_e32 v10, v58, v10 -; SI-NEXT: v_or_b32_e32 v11, v56, v11 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: v_or_b32_e32 v13, v44, v13 -; SI-NEXT: v_or_b32_e32 v14, v42, v14 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v40, v23 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33878,10 +31525,10 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -33890,194 +31537,198 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 @@ -34750,536 +32401,341 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-LABEL: bitcast_v48f16_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v49 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v3, v25, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_or_b32_e32 v2, v47, v2 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: v_or_b32_e32 v7, v54, v7 -; SI-NEXT: v_or_b32_e32 v8, v53, v8 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_or_b32_e32 v12, v37, v12 -; SI-NEXT: v_or_b32_e32 v13, v35, v13 -; SI-NEXT: v_or_b32_e32 v14, v33, v14 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 -; SI-NEXT: v_or_b32_e32 v16, v31, v16 -; SI-NEXT: v_or_b32_e32 v17, v29, v17 -; SI-NEXT: v_or_b32_e32 v18, v27, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s11 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v39 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v36, v59 -; SI-NEXT: v_mov_b32_e32 v59, v28 -; SI-NEXT: v_mov_b32_e32 v35, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v24 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v24, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v33 -; SI-NEXT: v_mov_b32_e32 v30, v61 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v28, v59 -; SI-NEXT: v_mov_b32_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 -; SI-NEXT: v_mov_b32_e32 v26, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v33, v49 -; SI-NEXT: v_mov_b32_e32 v34, v50 -; SI-NEXT: v_mov_b32_e32 v35, v51 -; SI-NEXT: v_mov_b32_e32 v36, v52 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v39, v55 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v48f16_to_v12f64_scalar: @@ -35874,7 +33330,30 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v48f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -35918,19 +33397,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -35947,540 +33413,490 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v54 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_or_b32_e32 v46, v1, v24 +; SI-NEXT: v_alignbit_b32 v1, v46, v58, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v44, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v43, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v41, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v55, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v55, v51, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_alignbit_b32 v1, v52, v40, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_or_b32_e32 v49, v1, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_alignbit_b32 v1, v49, v45, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v38, v1, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_alignbit_b32 v1, v38, v59, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_or_b32_e32 v35, v1, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_alignbit_b32 v1, v35, v63, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v33, v1, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_alignbit_b32 v1, v33, v48, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v31, v1, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v1, v31, v42, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v24, v1, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_alignbit_b32 v1, v24, v56, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v56, v22 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v25, v20 +; SI-NEXT: v_or_b32_e32 v18, v48, v18 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v54, v16 +; SI-NEXT: v_or_b32_e32 v14, v59, v14 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v10, v40, v10 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v36, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v46, v2, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v14, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v35, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v33, v20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v31, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v24, v27, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v63 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -36497,12 +33913,56 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v48f16: @@ -36965,411 +34425,503 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-LABEL: bitcast_v48i16_to_v48f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v24, s30, 0 +; SI-NEXT: v_writelane_b32 v24, s31, 1 +; SI-NEXT: v_writelane_b32 v24, s34, 2 +; SI-NEXT: v_writelane_b32 v24, s35, 3 +; SI-NEXT: v_writelane_b32 v24, s36, 4 +; SI-NEXT: v_writelane_b32 v24, s37, 5 +; SI-NEXT: v_writelane_b32 v24, s38, 6 +; SI-NEXT: v_writelane_b32 v24, s39, 7 +; SI-NEXT: v_writelane_b32 v24, s48, 8 +; SI-NEXT: v_writelane_b32 v24, s49, 9 +; SI-NEXT: v_writelane_b32 v24, s50, 10 +; SI-NEXT: v_writelane_b32 v24, s51, 11 +; SI-NEXT: v_writelane_b32 v24, s52, 12 +; SI-NEXT: v_writelane_b32 v24, s53, 13 +; SI-NEXT: v_writelane_b32 v24, s54, 14 +; SI-NEXT: v_writelane_b32 v24, s55, 15 +; SI-NEXT: v_writelane_b32 v24, s64, 16 +; SI-NEXT: v_writelane_b32 v24, s65, 17 +; SI-NEXT: v_writelane_b32 v24, s66, 18 +; SI-NEXT: v_writelane_b32 v24, s67, 19 +; SI-NEXT: v_writelane_b32 v24, s68, 20 +; SI-NEXT: v_writelane_b32 v24, s69, 21 +; SI-NEXT: v_writelane_b32 v24, s70, 22 +; SI-NEXT: v_writelane_b32 v24, s71, 23 +; SI-NEXT: v_writelane_b32 v24, s80, 24 +; SI-NEXT: v_writelane_b32 v24, s81, 25 +; SI-NEXT: v_writelane_b32 v24, s82, 26 +; SI-NEXT: v_writelane_b32 v24, s83, 27 +; SI-NEXT: v_writelane_b32 v24, s84, 28 +; SI-NEXT: v_writelane_b32 v24, s85, 29 +; SI-NEXT: v_writelane_b32 v24, s86, 30 +; SI-NEXT: v_writelane_b32 v24, s87, 31 +; SI-NEXT: v_writelane_b32 v24, s96, 32 +; SI-NEXT: v_writelane_b32 v24, s97, 33 +; SI-NEXT: v_writelane_b32 v24, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s85, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s70, s28, 16 +; SI-NEXT: s_lshr_b32 s49, s27, 16 +; SI-NEXT: s_lshr_b32 s69, s26, 16 +; SI-NEXT: s_lshr_b32 s48, s25, 16 +; SI-NEXT: s_lshr_b32 s68, s24, 16 +; SI-NEXT: s_lshr_b32 s39, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s22, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s66, s20, 16 +; SI-NEXT: s_lshr_b32 s37, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s18, 16 +; SI-NEXT: s_lshr_b32 s36, s17, 16 +; SI-NEXT: s_lshr_b32 s61, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_writelane_b32 v24, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s64, v8 +; SI-NEXT: v_readfirstlane_b32 s87, v7 +; SI-NEXT: v_readfirstlane_b32 s97, v6 +; SI-NEXT: v_readfirstlane_b32 s83, v5 +; SI-NEXT: v_readfirstlane_b32 s86, v4 +; SI-NEXT: v_readfirstlane_b32 s81, v3 +; SI-NEXT: v_readfirstlane_b32 s82, v2 +; SI-NEXT: v_readfirstlane_b32 s71, v1 +; SI-NEXT: v_readfirstlane_b32 s80, v0 +; SI-NEXT: v_readfirstlane_b32 s55, v11 +; SI-NEXT: v_readfirstlane_b32 s65, v12 +; SI-NEXT: v_readfirstlane_b32 s53, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s54, v15 +; SI-NEXT: v_readfirstlane_b32 s98, v16 +; SI-NEXT: v_readfirstlane_b32 s52, v17 +; SI-NEXT: v_readfirstlane_b32 s96, v18 +; SI-NEXT: v_readfirstlane_b32 s51, v19 +; SI-NEXT: v_readfirstlane_b32 s84, v9 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_mov_b32_e32 v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_mov_b32_e32 v60, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_mov_b32_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s36, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s60, s61, 16 +; SI-NEXT: s_mov_b32 s9, s61 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s37, 16 +; SI-NEXT: s_or_b32 s44, s4, s60 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s62, s63, 16 +; SI-NEXT: s_mov_b32 s11, s63 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s38, 16 +; SI-NEXT: s_or_b32 s42, s4, s62 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s72, s66, 16 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: s_or_b32 s40, s4, s72 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s74, s67, 16 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_or_b32 s14, s4, s74 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s76, s68, 16 +; SI-NEXT: s_or_b32 s77, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_or_b32 s12, s4, s76 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s78, s69, 16 +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_or_b32 s10, s4, s78 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s88, s70, 16 +; SI-NEXT: s_or_b32 s89, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_or_b32 s8, s4, s88 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s90, s84, 16 +; SI-NEXT: s_or_b32 s91, s5, s7 +; SI-NEXT: s_and_b32 s5, s81, 0xffff +; SI-NEXT: s_lshl_b32 s7, s52, 16 +; SI-NEXT: s_or_b32 s6, s4, s90 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s58, s96, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s83, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_or_b32 s4, s4, s58 +; SI-NEXT: s_lshl_b32 s56, s98, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s87, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_and_b32 s58, s86, 0xffff +; SI-NEXT: s_lshl_b32 s46, s99, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s85, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s97, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_lshl_b32 vcc_lo, s65, 16 +; SI-NEXT: s_mov_b32 s45, s61 +; SI-NEXT: s_lshr_b64 s[60:61], s[60:61], 16 +; SI-NEXT: s_mov_b32 s43, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s64, 0xffff +; SI-NEXT: s_mov_b32 s61, s9 +; SI-NEXT: s_mov_b32 s63, s11 +; SI-NEXT: s_mov_b32 s41, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s15, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s13, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 +; SI-NEXT: s_mov_b32 s11, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 +; SI-NEXT: s_mov_b32 s9, s89 +; SI-NEXT: s_lshr_b64 s[88:89], s[88:89], 16 +; SI-NEXT: s_mov_b32 s7, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 +; SI-NEXT: s_mov_b32 s5, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_or_b32 s46, s46, vcc_lo +; SI-NEXT: s_mov_b32 s47, vcc_hi +; SI-NEXT: s_lshr_b64 s[34:35], vcc, 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s18 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s64, s64, 3 +; SI-NEXT: s_and_b32 s4, s64, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s86, s86, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s86, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s81, 0xffff +; SI-NEXT: s_lshl_b32 s6, s52, 16 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s80, 0xffff +; SI-NEXT: s_lshl_b32 s7, s84, 16 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s71, 0xffff +; SI-NEXT: s_lshl_b32 s8, s51, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s28, 0xffff +; SI-NEXT: s_lshl_b32 s9, s70, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s10, s50, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s26, 0xffff +; SI-NEXT: s_lshl_b32 s11, s69, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s12, s49, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s68, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s14, s48, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s22, 0xffff +; SI-NEXT: s_lshl_b32 s15, s67, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s15, s22, s15 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s66, 16 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s40, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s38, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s41, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s63, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s42, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s37, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s43, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s61, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b32 s36, s45, 16 +; SI-NEXT: s_lshr_b32 s37, s43, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s13, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s9, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 16 +; SI-NEXT: s_lshr_b32 s52, s5, 16 +; SI-NEXT: s_lshr_b32 s54, s59, 16 +; SI-NEXT: s_lshr_b32 s53, s57, 16 +; SI-NEXT: s_lshr_b32 s55, s47, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v35 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xffff +; SI-NEXT: s_lshl_b32 s18, s36, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s42, 0xffff +; SI-NEXT: s_lshl_b32 s19, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s43, 0xffff +; SI-NEXT: s_lshl_b32 s20, s37, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s40, 0xffff +; SI-NEXT: s_lshl_b32 s21, s72, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s41, 0xffff +; SI-NEXT: s_lshl_b32 s22, s38, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s22, s74, 16 +; SI-NEXT: s_or_b32 s14, s14, s22 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s22, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s22 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s22, s76, 16 +; SI-NEXT: s_or_b32 s12, s12, s22 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s22, s48, 16 +; SI-NEXT: s_or_b32 s13, s13, s22 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s22, s78, 16 +; SI-NEXT: s_or_b32 s10, s10, s22 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s22, s49, 16 +; SI-NEXT: s_or_b32 s11, s11, s22 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s22, s88, 16 +; SI-NEXT: s_or_b32 s8, s8, s22 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s22, s50, 16 +; SI-NEXT: s_or_b32 s9, s9, s22 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s22, s90, 16 +; SI-NEXT: s_or_b32 s6, s6, s22 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s22, s51, 16 +; SI-NEXT: s_or_b32 s7, s7, s22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s22, s92, 16 +; SI-NEXT: s_or_b32 s4, s4, s22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s22, s52, 16 +; SI-NEXT: s_or_b32 s5, s5, s22 +; SI-NEXT: s_and_b32 s22, s58, 0xffff +; SI-NEXT: s_lshl_b32 s23, s94, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s59, 0xffff +; SI-NEXT: s_lshl_b32 s24, s54, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s56, 0xffff +; SI-NEXT: s_lshl_b32 s25, s30, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s57, 0xffff +; SI-NEXT: s_lshl_b32 s26, s53, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s46, 0xffff +; SI-NEXT: s_lshl_b32 s27, s34, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s28, s55, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s8 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: v_mov_b32_e32 v17, s5 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_readlane_b32 s99, v24, 35 +; SI-NEXT: v_readlane_b32 s98, v24, 34 +; SI-NEXT: v_readlane_b32 s97, v24, 33 +; SI-NEXT: v_readlane_b32 s96, v24, 32 +; SI-NEXT: v_readlane_b32 s87, v24, 31 +; SI-NEXT: v_readlane_b32 s86, v24, 30 +; SI-NEXT: v_readlane_b32 s85, v24, 29 +; SI-NEXT: v_readlane_b32 s84, v24, 28 +; SI-NEXT: v_readlane_b32 s83, v24, 27 +; SI-NEXT: v_readlane_b32 s82, v24, 26 +; SI-NEXT: v_readlane_b32 s81, v24, 25 +; SI-NEXT: v_readlane_b32 s80, v24, 24 +; SI-NEXT: v_readlane_b32 s71, v24, 23 +; SI-NEXT: v_readlane_b32 s70, v24, 22 +; SI-NEXT: v_readlane_b32 s69, v24, 21 +; SI-NEXT: v_readlane_b32 s68, v24, 20 +; SI-NEXT: v_readlane_b32 s67, v24, 19 +; SI-NEXT: v_readlane_b32 s66, v24, 18 +; SI-NEXT: v_readlane_b32 s65, v24, 17 +; SI-NEXT: v_readlane_b32 s64, v24, 16 +; SI-NEXT: v_readlane_b32 s55, v24, 15 +; SI-NEXT: v_readlane_b32 s54, v24, 14 +; SI-NEXT: v_readlane_b32 s53, v24, 13 +; SI-NEXT: v_readlane_b32 s52, v24, 12 +; SI-NEXT: v_readlane_b32 s51, v24, 11 +; SI-NEXT: v_readlane_b32 s50, v24, 10 +; SI-NEXT: v_readlane_b32 s49, v24, 9 +; SI-NEXT: v_readlane_b32 s48, v24, 8 +; SI-NEXT: v_readlane_b32 s39, v24, 7 +; SI-NEXT: v_readlane_b32 s38, v24, 6 +; SI-NEXT: v_readlane_b32 s37, v24, 5 +; SI-NEXT: v_readlane_b32 s36, v24, 4 +; SI-NEXT: v_readlane_b32 s35, v24, 3 +; SI-NEXT: v_readlane_b32 s34, v24, 2 +; SI-NEXT: v_readlane_b32 s31, v24, 1 +; SI-NEXT: v_readlane_b32 s30, v24, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: v_mov_b32_e32 v15, v12 -; SI-NEXT: v_mov_b32_e32 v14, v57 -; SI-NEXT: v_mov_b32_e32 v12, v47 -; SI-NEXT: v_mov_b32_e32 v10, v45 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v43 -; SI-NEXT: v_mov_b32_e32 v60, v41 -; SI-NEXT: v_mov_b32_e32 v59, v55 -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v35 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v48i16_to_v48f16_scalar: @@ -38130,425 +35682,318 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v48i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v23, v23, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v25, v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v28, v28, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v20, v20, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v16, v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v31, v31, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_or_b32_e32 v10, v10, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v34, v34, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_or_b32_e32 v8, v8, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_or_b32_e32 v36, v36, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v55 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v53 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: v_or_b32_e32 v35, v35, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v6, v6, v52 -; SI-NEXT: v_or_b32_e32 v32, v32, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v42 -; SI-NEXT: v_or_b32_e32 v12, v12, v49 -; SI-NEXT: v_or_b32_e32 v29, v29, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v43 -; SI-NEXT: v_or_b32_e32 v18, v18, v38 -; SI-NEXT: v_or_b32_e32 v26, v26, v37 -; SI-NEXT: v_or_b32_e32 v27, v27, v44 -; SI-NEXT: v_alignbit_b32 v40, v2, v24, 16 -; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v8, v41, 16 -; SI-NEXT: v_alignbit_b32 v53, v34, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v10, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v14, v42, 16 -; SI-NEXT: v_alignbit_b32 v50, v31, v49, 16 -; SI-NEXT: v_alignbit_b32 v49, v16, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v43, 16 -; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v22, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v25, v44, 16 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_or_b32_e32 v12, v12, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_alignbit_b32 v55, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v54, v3, v54, 16 +; SI-NEXT: v_alignbit_b32 v53, v5, v53, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v52, 16 +; SI-NEXT: v_alignbit_b32 v51, v9, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v50, 16 +; SI-NEXT: v_alignbit_b32 v48, v13, v48, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v39, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v37, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v34, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v31, 16 +; SI-NEXT: v_alignbit_b32 v28, v23, v28, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 -; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v24 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 ; SI-NEXT: v_or_b32_e32 v4, v4, v24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_or_b32_e32 v8, v8, v24 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v50 ; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v49 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 ; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 ; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v54 ; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39013,439 +36458,369 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s42, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v50 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 -; SI-NEXT: v_or_b32_e32 v27, v14, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_or_b32_e32 v11, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s21 +; SI-NEXT: v_or_b32_e32 v13, v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s12 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v28, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_or_b32_e32 v34, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v36, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v49 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v15, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: v_or_b32_e32 v59, v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 -; SI-NEXT: v_or_b32_e32 v31, v16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s24 +; SI-NEXT: v_or_b32_e32 v31, v16, v10 +; SI-NEXT: v_or_b32_e32 v43, v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v58, v14, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v32, v16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_or_b32_e32 v5, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_or_b32_e32 v61, v18, v27 +; SI-NEXT: v_or_b32_e32 v41, v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 +; SI-NEXT: v_or_b32_e32 v62, v16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 +; SI-NEXT: v_or_b32_e32 v42, v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_or_b32_e32 v62, v16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v63, v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v26, v22, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v28, v22, v20 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v24 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v29, v25, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 -; SI-NEXT: v_or_b32_e32 v21, v21, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v24 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 -; SI-NEXT: v_or_b32_e32 v13, v13, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_or_b32_e32 v11, v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 -; SI-NEXT: v_or_b32_e32 v7, v7, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v24 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[8:9], 16 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_mov_b32_e32 v39, v32 -; SI-NEXT: v_lshr_b64 v[36:37], v[10:11], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v44 -; SI-NEXT: v_or_b32_e32 v3, v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_mov_b32_e32 v37, v29 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_mov_b32_e32 v31, v27 -; SI-NEXT: v_mov_b32_e32 v29, v26 -; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[22:23], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[54:55], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[2:3], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[6:7], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshr_b64 v[29:30], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[27:28], 16 +; SI-NEXT: v_or_b32_e32 v23, v15, v2 +; SI-NEXT: v_or_b32_e32 v24, v16, v4 +; SI-NEXT: v_or_b32_e32 v37, v18, v8 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v51, v48 +; SI-NEXT: v_lshr_b64 v[48:49], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: v_lshr_b64 v[15:16], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: v_or_b32_e32 v45, v17, v6 +; SI-NEXT: v_lshr_b64 v[39:40], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[8:9], 16 +; SI-NEXT: v_or_b32_e32 v14, v14, v0 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, s8 +; SI-NEXT: v_mov_b32_e32 v59, s10 +; SI-NEXT: v_mov_b32_e32 v58, s12 +; SI-NEXT: v_mov_b32_e32 v57, s14 +; SI-NEXT: v_mov_b32_e32 v56, s15 +; SI-NEXT: v_mov_b32_e32 v46, s41 +; SI-NEXT: v_mov_b32_e32 v47, s40 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s28 +; SI-NEXT: v_mov_b32_e32 v41, s26 +; SI-NEXT: v_mov_b32_e32 v61, s24 +; SI-NEXT: v_mov_b32_e32 v42, s22 +; SI-NEXT: v_mov_b32_e32 v43, s20 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s16 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v28, s25 +; SI-NEXT: v_mov_b32_e32 v34, s27 +; SI-NEXT: v_mov_b32_e32 v36, s29 +; SI-NEXT: v_mov_b32_e32 v39, s43 +; SI-NEXT: v_mov_b32_e32 v54, s42 +; SI-NEXT: v_mov_b32_e32 v29, s13 +; SI-NEXT: v_mov_b32_e32 v52, s11 +; SI-NEXT: v_mov_b32_e32 v50, s9 +; SI-NEXT: v_mov_b32_e32 v48, s7 +; SI-NEXT: v_mov_b32_e32 v32, s6 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v63 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_or_b32_e32 v27, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_or_b32_e32 v25, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_or_b32_e32 v29, v6, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v52 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v8, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; SI-NEXT: v_or_b32_e32 v28, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v60 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v60 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39462,12 +36837,24 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v3, v25 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v9, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 18de1fc68024e..911c911fa1ad4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -6416,490 +6416,217 @@ end: define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v26i32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7448,6 +7175,16 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 ; SI-NEXT: v_mov_b32_e32 v13, s16 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: v_mov_b32_e32 v15, s18 @@ -7455,138 +7192,80 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s20 ; SI-NEXT: v_mov_b32_e32 v18, s21 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 ; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_readfirstlane_b32 s43, v14 ; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v15 ; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_readfirstlane_b32 s41, v16 ; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v17 +; SI-NEXT: v_readfirstlane_b32 s24, v17 ; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v18 ; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_readfirstlane_b32 s29, v13 -; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s19, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v19 -; SI-NEXT: v_readfirstlane_b32 s17, v0 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s15, v2 -; SI-NEXT: v_readfirstlane_b32 s14, v3 -; SI-NEXT: v_readfirstlane_b32 s13, v4 -; SI-NEXT: v_readfirstlane_b32 s12, v5 -; SI-NEXT: v_readfirstlane_b32 s11, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: v_readfirstlane_b32 s7, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -7601,258 +7280,184 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 16 -; SI-NEXT: s_lshr_b32 s44, s26, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: s_lshr_b32 s47, s29, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s22, 16 -; SI-NEXT: s_lshr_b32 s58, s21, 16 -; SI-NEXT: s_lshr_b32 s59, s20, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 16 -; SI-NEXT: s_lshr_b32 s61, s18, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s16, 16 -; SI-NEXT: s_lshr_b32 s72, s15, 16 -; SI-NEXT: s_lshr_b32 s73, s14, 16 -; SI-NEXT: s_lshr_b32 s74, s13, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 16 -; SI-NEXT: s_lshr_b32 s76, s11, 16 -; SI-NEXT: s_lshr_b32 s77, s10, 16 -; SI-NEXT: s_lshr_b32 s78, s8, 16 -; SI-NEXT: s_lshr_b32 s79, s7, 16 -; SI-NEXT: s_lshr_b32 s88, s6, 16 -; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 -; SI-NEXT: v_or_b32_e32 v13, v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v21, v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v28, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 -; SI-NEXT: v_or_b32_e32 v4, v55, v4 -; SI-NEXT: v_or_b32_e32 v6, v53, v6 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v12, v39, v12 -; SI-NEXT: v_or_b32_e32 v14, v37, v14 -; SI-NEXT: v_or_b32_e32 v16, v35, v16 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v20, v31, v20 -; SI-NEXT: v_or_b32_e32 v22, v29, v22 -; SI-NEXT: v_or_b32_e32 v24, v27, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s42, s44, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: @@ -8542,7 +8147,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -8559,186 +8163,243 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -8774,134 +8435,20 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -8914,10 +8461,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -8926,118 +8473,121 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -9045,98 +8595,98 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -9867,575 +9417,378 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v26i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v52f16_to_v26i32_scalar: @@ -16589,490 +15942,217 @@ end: define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v26f32_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v37, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v50, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17596,489 +16676,260 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v27, s17 -; SI-NEXT: v_mov_b32_e32 v26, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v24, s20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s21 -; SI-NEXT: v_mov_b32_e32 v62, s22 -; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v30, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v32, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_or_b32_e32 v11, v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v32 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v20, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v49 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v33 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: @@ -18846,7 +17697,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -18863,186 +17713,243 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -19078,134 +17985,20 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -19218,10 +18011,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -19230,118 +18023,121 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -19349,98 +18145,98 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -20171,575 +18967,378 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-LABEL: bitcast_v52f16_to_v26f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v52f16_to_v26f32_scalar: @@ -26043,214 +24642,72 @@ end: define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v13i64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -26267,7 +24724,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 @@ -26278,263 +24734,133 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v13i64_to_v52f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v52f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill @@ -27089,6 +25415,16 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 ; SI-NEXT: v_mov_b32_e32 v13, s16 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: v_mov_b32_e32 v15, s18 @@ -27096,404 +25432,272 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v17, s20 ; SI-NEXT: v_mov_b32_e32 v18, s21 ; SI-NEXT: v_mov_b32_e32 v19, s22 -; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_readfirstlane_b32 s42, v13 ; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_readfirstlane_b32 s43, v14 ; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v15 ; SI-NEXT: v_mov_b32_e32 v15, s25 -; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_readfirstlane_b32 s41, v16 ; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v17 +; SI-NEXT: v_readfirstlane_b32 s24, v17 ; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v18 ; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_readfirstlane_b32 s22, v19 ; SI-NEXT: v_mov_b32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_readfirstlane_b32 s28, v13 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s20, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s18, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v19 -; SI-NEXT: v_readfirstlane_b32 s16, v0 -; SI-NEXT: v_readfirstlane_b32 s17, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s7, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_readfirstlane_b32 s19, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v18 +; SI-NEXT: v_readfirstlane_b32 s17, v19 +; SI-NEXT: v_readfirstlane_b32 s14, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_writelane_b32 v26, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s41, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s41, s42, 0 -; SI-NEXT: s_lshr_b32 s42, s24, 16 -; SI-NEXT: s_lshr_b32 s43, s41, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: s_lshr_b32 s47, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s56, s22, 16 -; SI-NEXT: s_lshr_b32 s57, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s58, s20, 16 -; SI-NEXT: s_lshr_b32 s59, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s60, s18, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s62, s16, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s72, s14, 16 -; SI-NEXT: s_lshr_b32 s73, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s76, s10, 16 -; SI-NEXT: s_lshr_b32 s77, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s78, s7, 16 -; SI-NEXT: s_lshr_b32 s79, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s88, s6, 16 -; SI-NEXT: s_lshr_b32 s89, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s17, 16 +; SI-NEXT: s_lshr_b32 s35, s19, 16 +; SI-NEXT: s_lshr_b32 s36, s21, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s25, 16 +; SI-NEXT: s_lshr_b32 s39, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s43, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 -; SI-NEXT: v_or_b32_e32 v9, v50, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 -; SI-NEXT: v_or_b32_e32 v11, v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v48 -; SI-NEXT: v_or_b32_e32 v13, v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_or_b32_e32 v15, v36, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v17, v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; SI-NEXT: v_or_b32_e32 v19, v32, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v21, v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v28, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 -; SI-NEXT: v_or_b32_e32 v4, v55, v4 -; SI-NEXT: v_or_b32_e32 v6, v53, v6 -; SI-NEXT: v_or_b32_e32 v8, v51, v8 -; SI-NEXT: v_or_b32_e32 v10, v49, v10 -; SI-NEXT: v_or_b32_e32 v12, v39, v12 -; SI-NEXT: v_or_b32_e32 v14, v37, v14 -; SI-NEXT: v_or_b32_e32 v16, v35, v16 -; SI-NEXT: v_or_b32_e32 v18, v33, v18 -; SI-NEXT: v_or_b32_e32 v20, v31, v20 -; SI-NEXT: v_or_b32_e32 v22, v29, v22 -; SI-NEXT: v_or_b32_e32 v24, v27, v24 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_and_b32 s29, s42, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_lshl_b32 s42, s78, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s39, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_lshl_b32 s42, s76, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s42 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s25, s25, s42 +; SI-NEXT: s_lshl_b32 s42, s74, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s42 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s42, s37, 16 +; SI-NEXT: s_or_b32 s23, s23, s42 +; SI-NEXT: s_lshl_b32 s42, s72, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s42 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s21, s21, s42 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s42, s62, 16 +; SI-NEXT: s_or_b32 s18, s18, s42 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s42, s35, 16 +; SI-NEXT: s_or_b32 s19, s19, s42 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s42, s60, 16 +; SI-NEXT: s_or_b32 s16, s16, s42 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s42, s34, 16 +; SI-NEXT: s_or_b32 s17, s17, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s58, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s31, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s56, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s30, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s46, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s95, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s42, s44, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s42 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s42, s94, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s93, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s92, 16 +; SI-NEXT: s_or_b32 s9, s9, s42 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s24 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v17, s13 +; SI-NEXT: v_mov_b32_e32 v18, s10 +; SI-NEXT: v_mov_b32_e32 v19, s11 +; SI-NEXT: v_mov_b32_e32 v20, s8 +; SI-NEXT: v_mov_b32_e32 v21, s9 +; SI-NEXT: v_mov_b32_e32 v22, s6 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_mov_b32_e32 v25, s5 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: @@ -28183,7 +26387,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -28200,186 +26403,243 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -28415,134 +26675,20 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28555,10 +26701,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -28567,118 +26713,121 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -28686,98 +26835,98 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -29508,575 +27657,378 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v13i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v52f16_to_v13i64_scalar: @@ -34510,465 +32462,204 @@ end: define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v13f64_to_v52f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v28, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v29, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v32, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v33, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v36, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v39, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v49, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_mov_b32_e32 v51, v24 -; SI-NEXT: v_mov_b32_e32 v49, v25 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v63 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v56 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v29 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35481,462 +33172,232 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v9 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[49:50], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[4:5], 16 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_mov_b32_e32 v51, v10 -; SI-NEXT: v_mov_b32_e32 v49, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v36, v24, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v26, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v20, v21 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v57 +; SI-NEXT: v_or_b32_e32 v29, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_or_b32_e32 v31, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_or_b32_e32 v35, v14, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v41 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v53 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v42 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v36 +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v3, v27 +; SI-NEXT: v_mov_b32_e32 v4, v28 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v31 +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v11, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: @@ -36678,7 +34139,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -36695,186 +34155,243 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_mov_b32_e32 v57, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v39, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_mov_b32_e32 v48, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v11 +; SI-NEXT: v_mov_b32_e32 v55, v10 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v41, v8 +; SI-NEXT: v_mov_b32_e32 v42, v7 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v44, v5 +; SI-NEXT: v_mov_b32_e32 v45, v4 +; SI-NEXT: v_mov_b32_e32 v46, v3 +; SI-NEXT: v_mov_b32_e32 v47, v2 +; SI-NEXT: v_mov_b32_e32 v56, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v57 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v53 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -36910,134 +34427,20 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v45 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 -; SI-NEXT: v_or_b32_e32 v2, v54, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v50, v4 -; SI-NEXT: v_or_b32_e32 v5, v48, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 -; SI-NEXT: v_or_b32_e32 v7, v36, v7 -; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v9, v32, v9 -; SI-NEXT: v_or_b32_e32 v10, v62, v10 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v56, v23 -; SI-NEXT: v_or_b32_e32 v24, v46, v24 -; SI-NEXT: v_or_b32_e32 v25, v44, v25 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -37050,10 +34453,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -37062,118 +34465,121 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -37181,98 +34587,98 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 @@ -38003,575 +35409,378 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-LABEL: bitcast_v52f16_to_v13f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v32, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v38, v5 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_mov_b32_e32 v51, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v39 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v51 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: v_or_b32_e32 v6, v29, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_or_b32_e32 v5, v44, v5 -; SI-NEXT: v_or_b32_e32 v7, v34, v7 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_or_b32_e32 v11, v54, v11 -; SI-NEXT: v_or_b32_e32 v12, v53, v12 -; SI-NEXT: v_or_b32_e32 v13, v51, v13 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v15, v39, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: v_or_b32_e32 v17, v30, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s20 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s24 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v55 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v42, v52 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v51 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v40, v50 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v48 -; SI-NEXT: v_mov_b32_e32 v52, v26 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v48, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v27, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 -; SI-NEXT: v_mov_b32_e32 v26, v52 -; SI-NEXT: v_mov_b32_e32 v48, v54 -; SI-NEXT: v_mov_b32_e32 v49, v55 -; SI-NEXT: v_mov_b32_e32 v50, v40 -; SI-NEXT: v_mov_b32_e32 v51, v41 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v45 -; SI-NEXT: v_mov_b32_e32 v45, v44 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v52f16_to_v13f64_scalar: @@ -39213,7 +36422,58 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v52f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -39257,488 +36517,561 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v55 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_or_b32_e32 v59, v1, v26 +; SI-NEXT: v_mov_b32_e32 v26, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v57, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v47, v41, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v45, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v43, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v43, v58, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v42, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v42, v63, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v55, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_alignbit_b32 v1, v55, v60, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v52, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v49, v1, v29 +; SI-NEXT: v_alignbit_b32 v1, v49, v61, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v38, v1, v30 +; SI-NEXT: v_alignbit_b32 v1, v38, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v36, v1, v51 +; SI-NEXT: v_alignbit_b32 v1, v36, v27, 16 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v34, v1, v48 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_alignbit_b32 v1, v34, v28, 16 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_or_b32_e32 v3, v1, v31 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_alignbit_b32 v1, v3, v44, 16 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v44, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x30000, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v24, v31, v24 +; SI-NEXT: v_or_b32_e32 v22, v28, v22 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v48, v22 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v51, v20 +; SI-NEXT: v_or_b32_e32 v18, v62, v18 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_or_b32_e32 v16, v61, v16 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v14, v56, v14 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v63, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v59, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v45, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v43, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v12, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v14, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v49, v18, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v38, v20, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v36, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v34, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v26, v32, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39755,172 +37088,50 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v33 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v52f16: @@ -40437,495 +37648,555 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-LABEL: bitcast_v52i16_to_v52f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_writelane_b32 v26, s30, 0 +; SI-NEXT: v_writelane_b32 v26, s31, 1 +; SI-NEXT: v_writelane_b32 v26, s34, 2 +; SI-NEXT: v_writelane_b32 v26, s35, 3 +; SI-NEXT: v_writelane_b32 v26, s36, 4 +; SI-NEXT: v_writelane_b32 v26, s37, 5 +; SI-NEXT: v_writelane_b32 v26, s38, 6 +; SI-NEXT: v_writelane_b32 v26, s39, 7 +; SI-NEXT: v_writelane_b32 v26, s48, 8 +; SI-NEXT: v_writelane_b32 v26, s49, 9 +; SI-NEXT: v_writelane_b32 v26, s50, 10 +; SI-NEXT: v_writelane_b32 v26, s51, 11 +; SI-NEXT: v_writelane_b32 v26, s52, 12 +; SI-NEXT: v_writelane_b32 v26, s53, 13 +; SI-NEXT: v_writelane_b32 v26, s54, 14 +; SI-NEXT: v_writelane_b32 v26, s55, 15 +; SI-NEXT: v_writelane_b32 v26, s64, 16 +; SI-NEXT: v_writelane_b32 v26, s65, 17 +; SI-NEXT: v_writelane_b32 v26, s66, 18 +; SI-NEXT: v_writelane_b32 v26, s67, 19 +; SI-NEXT: v_writelane_b32 v26, s68, 20 +; SI-NEXT: v_writelane_b32 v26, s69, 21 +; SI-NEXT: v_writelane_b32 v26, s70, 22 +; SI-NEXT: v_writelane_b32 v26, s71, 23 +; SI-NEXT: v_writelane_b32 v26, s80, 24 +; SI-NEXT: v_writelane_b32 v26, s81, 25 +; SI-NEXT: v_writelane_b32 v26, s82, 26 +; SI-NEXT: v_writelane_b32 v26, s83, 27 +; SI-NEXT: v_writelane_b32 v26, s84, 28 +; SI-NEXT: v_writelane_b32 v26, s85, 29 +; SI-NEXT: v_writelane_b32 v26, s86, 30 +; SI-NEXT: v_writelane_b32 v26, s87, 31 +; SI-NEXT: v_writelane_b32 v26, s96, 32 +; SI-NEXT: v_writelane_b32 v26, s97, 33 +; SI-NEXT: v_writelane_b32 v26, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_readfirstlane_b32 s98, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s70, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s87, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_lshr_b32 s54, s29, 16 +; SI-NEXT: s_lshr_b32 s91, s28, 16 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s89, s26, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s79, s24, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s22, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s20, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s18, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s16, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_writelane_b32 v26, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s80, v6 +; SI-NEXT: v_readfirstlane_b32 s97, v5 +; SI-NEXT: v_readfirstlane_b32 s99, v4 +; SI-NEXT: v_readfirstlane_b32 s31, v3 +; SI-NEXT: v_readfirstlane_b32 s96, v2 +; SI-NEXT: v_readfirstlane_b32 s93, v1 +; SI-NEXT: v_readfirstlane_b32 s95, v0 +; SI-NEXT: v_readfirstlane_b32 s66, v13 +; SI-NEXT: v_readfirstlane_b32 s85, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s68, v15 +; SI-NEXT: v_readfirstlane_b32 s86, v16 +; SI-NEXT: v_readfirstlane_b32 s67, v17 +; SI-NEXT: v_readfirstlane_b32 s84, v18 +; SI-NEXT: v_readfirstlane_b32 s65, v19 +; SI-NEXT: v_readfirstlane_b32 s83, v11 +; SI-NEXT: v_readfirstlane_b32 s64, v10 +; SI-NEXT: v_readfirstlane_b32 s81, v9 +; SI-NEXT: v_readfirstlane_b32 s55, v8 +; SI-NEXT: v_readfirstlane_b32 s69, v7 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_mov_b32_e32 v22, v16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_mov_b32_e32 v23, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_mov_b32_e32 v24, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_mov_b32_e32 v25, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v21, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 -; SI-NEXT: v_mov_b32_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_mov_b32_e32 v16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v17, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_mov_b32_e32 v18, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v19, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 -; SI-NEXT: v_mov_b32_e32 v20, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s48, 16 +; SI-NEXT: s_lshl_b32 s62, s63, 16 +; SI-NEXT: s_mov_b32 s9, s63 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s49, 16 +; SI-NEXT: s_lshl_b32 s72, s73, 16 +; SI-NEXT: s_mov_b32 s11, s73 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s50, 16 +; SI-NEXT: s_lshl_b32 s74, s75, 16 +; SI-NEXT: s_mov_b32 s88, s75 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s51, 16 +; SI-NEXT: s_lshl_b32 s76, s77, 16 +; SI-NEXT: s_lshl_b32 s60, s89, 16 +; SI-NEXT: s_lshl_b32 s58, s91, 16 +; SI-NEXT: s_mov_b32 s92, s91 +; SI-NEXT: s_mov_b32 s91, s89 +; SI-NEXT: s_mov_b32 s89, s77 +; SI-NEXT: s_or_b32 s77, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s52, 16 +; SI-NEXT: s_lshl_b32 s78, s79, 16 +; SI-NEXT: s_mov_b32 s90, s79 +; SI-NEXT: s_or_b32 s79, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s53, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s93, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s14, s4, s62 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s12, s4, s72 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s97, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s10, s4, s74 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s87, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s8, s4, s76 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s98, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_mov_b32 s15, s63 +; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 16 +; SI-NEXT: s_mov_b32 s13, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_or_b32 s6, s4, s78 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_mov_b32 s63, s9 +; SI-NEXT: s_mov_b32 s73, s11 +; SI-NEXT: s_mov_b32 s11, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_mov_b32 s9, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 16 +; SI-NEXT: s_or_b32 s4, s4, s60 +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_mov_b32 s75, s88 +; SI-NEXT: s_mov_b32 s77, s89 +; SI-NEXT: s_mov_b32 s7, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[78:79], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 +; SI-NEXT: s_and_b32 s60, s28, 0xffff +; SI-NEXT: s_lshl_b32 s56, s69, 16 +; SI-NEXT: s_mov_b32 s79, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_or_b32 s60, s60, s58 +; SI-NEXT: s_lshr_b64 s[90:91], s[58:59], 16 +; SI-NEXT: s_and_b32 s58, s95, 0xffff +; SI-NEXT: s_lshl_b32 s46, s81, 16 +; SI-NEXT: s_mov_b32 s91, s92 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_mov_b32 s94, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[56:57], 16 +; SI-NEXT: s_and_b32 s56, s96, 0xffff +; SI-NEXT: s_lshl_b32 s44, s83, 16 +; SI-NEXT: s_mov_b32 s93, s94 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_mov_b32 s30, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s99, 0xffff +; SI-NEXT: s_lshl_b32 s42, s84, 16 +; SI-NEXT: s_mov_b32 s95, s30 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_mov_b32 s34, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s80, 0xffff +; SI-NEXT: s_lshl_b32 s40, s86, 16 +; SI-NEXT: s_mov_b32 s31, s34 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s70, 0xffff +; SI-NEXT: s_lshl_b32 vcc_lo, s85, 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_lshr_b64 s[36:37], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s82, 0xffff +; SI-NEXT: s_mov_b32 s5, s61 +; SI-NEXT: s_mov_b32 s61, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_or_b32 s40, s40, vcc_lo +; SI-NEXT: s_mov_b32 s41, vcc_hi +; SI-NEXT: s_lshr_b64 s[38:39], vcc, 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s8 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s19 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s20 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s21 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s98, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s99, s99, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s96, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s31, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s95, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s93, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s60, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s54, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s61, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s6, s53, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s25, 0xffff +; SI-NEXT: s_lshl_b32 s8, s52, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s77, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s10, s51, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s12, s50, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s18, 0xffff +; SI-NEXT: s_lshl_b32 s13, s73, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s14, s49, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s16, 0xffff +; SI-NEXT: s_lshl_b32 s15, s63, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 +; SI-NEXT: s_lshr_b32 s48, s15, 16 +; SI-NEXT: s_lshr_b32 s49, s13, 16 +; SI-NEXT: s_lshr_b32 s50, s11, 16 +; SI-NEXT: s_lshr_b32 s51, s9, 16 +; SI-NEXT: s_lshr_b32 s52, s7, 16 +; SI-NEXT: s_lshr_b32 s53, s5, 16 +; SI-NEXT: s_lshr_b32 s54, s61, 16 +; SI-NEXT: s_lshr_b32 s55, s59, 16 +; SI-NEXT: s_lshr_b32 s64, s57, 16 +; SI-NEXT: s_lshr_b32 s65, s47, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s43, 16 +; SI-NEXT: s_lshr_b32 s66, s41, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v38 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v32 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v56 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s16, s49, 16 +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s10, s10, s16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s16, s50, 16 +; SI-NEXT: s_or_b32 s11, s11, s16 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s51, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s52, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s53, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xffff +; SI-NEXT: s_lshl_b32 s18, s54, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s58, 0xffff +; SI-NEXT: s_lshl_b32 s19, s92, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s59, 0xffff +; SI-NEXT: s_lshl_b32 s20, s55, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s56, 0xffff +; SI-NEXT: s_lshl_b32 s21, s94, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s57, 0xffff +; SI-NEXT: s_lshl_b32 s22, s64, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s46, 0xffff +; SI-NEXT: s_lshl_b32 s23, s30, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s47, 0xffff +; SI-NEXT: s_lshl_b32 s24, s65, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s44, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s45, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s42, 0xffff +; SI-NEXT: s_lshl_b32 s27, s36, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s28, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s38, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s66, 16 +; SI-NEXT: s_or_b32 s29, s29, s40 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v19, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v23, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_readlane_b32 s99, v26, 35 +; SI-NEXT: v_readlane_b32 s98, v26, 34 +; SI-NEXT: v_readlane_b32 s97, v26, 33 +; SI-NEXT: v_readlane_b32 s96, v26, 32 +; SI-NEXT: v_readlane_b32 s87, v26, 31 +; SI-NEXT: v_readlane_b32 s86, v26, 30 +; SI-NEXT: v_readlane_b32 s85, v26, 29 +; SI-NEXT: v_readlane_b32 s84, v26, 28 +; SI-NEXT: v_readlane_b32 s83, v26, 27 +; SI-NEXT: v_readlane_b32 s82, v26, 26 +; SI-NEXT: v_readlane_b32 s81, v26, 25 +; SI-NEXT: v_readlane_b32 s80, v26, 24 +; SI-NEXT: v_readlane_b32 s71, v26, 23 +; SI-NEXT: v_readlane_b32 s70, v26, 22 +; SI-NEXT: v_readlane_b32 s69, v26, 21 +; SI-NEXT: v_readlane_b32 s68, v26, 20 +; SI-NEXT: v_readlane_b32 s67, v26, 19 +; SI-NEXT: v_readlane_b32 s66, v26, 18 +; SI-NEXT: v_readlane_b32 s65, v26, 17 +; SI-NEXT: v_readlane_b32 s64, v26, 16 +; SI-NEXT: v_readlane_b32 s55, v26, 15 +; SI-NEXT: v_readlane_b32 s54, v26, 14 +; SI-NEXT: v_readlane_b32 s53, v26, 13 +; SI-NEXT: v_readlane_b32 s52, v26, 12 +; SI-NEXT: v_readlane_b32 s51, v26, 11 +; SI-NEXT: v_readlane_b32 s50, v26, 10 +; SI-NEXT: v_readlane_b32 s49, v26, 9 +; SI-NEXT: v_readlane_b32 s48, v26, 8 +; SI-NEXT: v_readlane_b32 s39, v26, 7 +; SI-NEXT: v_readlane_b32 s38, v26, 6 +; SI-NEXT: v_readlane_b32 s37, v26, 5 +; SI-NEXT: v_readlane_b32 s36, v26, 4 +; SI-NEXT: v_readlane_b32 s35, v26, 3 +; SI-NEXT: v_readlane_b32 s34, v26, 2 +; SI-NEXT: v_readlane_b32 s31, v26, 1 +; SI-NEXT: v_readlane_b32 s30, v26, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v25, v19 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v24, v18 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v23, v17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v22, v16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: v_mov_b32_e32 v21, v14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: v_mov_b32_e32 v20, v63 -; SI-NEXT: v_mov_b32_e32 v19, v61 -; SI-NEXT: v_mov_b32_e32 v18, v47 -; SI-NEXT: v_mov_b32_e32 v17, v46 -; SI-NEXT: v_mov_b32_e32 v16, v45 -; SI-NEXT: v_mov_b32_e32 v14, v43 -; SI-NEXT: v_mov_b32_e32 v12, v41 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v52i16_to_v52f16_scalar: @@ -41749,7 +39020,6 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v52f16_to_v52i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill @@ -41759,137 +39029,35 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v45 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41934,212 +39102,216 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v25, v25, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v28, v28, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v24, v24, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v21, v21, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v29, v29, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_or_b32_e32 v31, v31, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v17, v17, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v32, v32, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v34, v34, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_or_b32_e32 v12, v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_or_b32_e32 v35, v35, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v37, v37, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_or_b32_e32 v39, v39, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v55 -; SI-NEXT: v_or_b32_e32 v2, v2, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_or_b32_e32 v38, v38, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v41 -; SI-NEXT: v_or_b32_e32 v8, v8, v40 -; SI-NEXT: v_or_b32_e32 v36, v36, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 ; SI-NEXT: v_or_b32_e32 v10, v10, v54 -; SI-NEXT: v_or_b32_e32 v14, v14, v53 -; SI-NEXT: v_or_b32_e32 v33, v33, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v53 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 ; SI-NEXT: v_or_b32_e32 v16, v16, v51 -; SI-NEXT: v_or_b32_e32 v20, v20, v50 -; SI-NEXT: v_or_b32_e32 v30, v30, v47 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v22, v22, v48 -; SI-NEXT: v_or_b32_e32 v27, v27, v56 -; SI-NEXT: v_alignbit_b32 v44, v2, v26, 16 -; SI-NEXT: v_alignbit_b32 v43, v39, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v6, v41, 16 -; SI-NEXT: v_alignbit_b32 v41, v37, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v35, v45, 16 -; SI-NEXT: v_alignbit_b32 v55, v12, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v34, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, v32, v46, 16 -; SI-NEXT: v_alignbit_b32 v52, v18, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v31, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, v29, v47, 16 -; SI-NEXT: v_alignbit_b32 v49, v24, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v28, v56, 16 +; SI-NEXT: v_or_b32_e32 v24, v24, v56 +; SI-NEXT: v_alignbit_b32 v44, v1, v26, 16 +; SI-NEXT: v_alignbit_b32 v43, v3, v43, 16 +; SI-NEXT: v_alignbit_b32 v42, v5, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v7, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v9, v45, 16 +; SI-NEXT: v_alignbit_b32 v55, v11, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v13, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v15, v46, 16 +; SI-NEXT: v_alignbit_b32 v52, v17, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, v21, v47, 16 +; SI-NEXT: v_alignbit_b32 v49, v23, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v56, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 ; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v26 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v36 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload @@ -42151,63 +39323,59 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v36 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 -; SI-NEXT: v_or_b32_e32 v9, v26, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v26 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 -; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v16, v16, v26 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v26 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 ; SI-NEXT: v_or_b32_e32 v20, v20, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v26 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; SI-NEXT: v_or_b32_e32 v23, v23, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v48 ; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -42726,488 +39894,414 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s26 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s28, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 ; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: s_cbranch_execnz .LBB59_4 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 -; SI-NEXT: v_or_b32_e32 v20, v20, v0 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s40 +; SI-NEXT: v_or_b32_e32 v2, v4, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s24 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v51, v0, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s13 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s28 +; SI-NEXT: v_or_b32_e32 v34, v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_or_b32_e32 v59, v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v17, v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_or_b32_e32 v38, v22, v2 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_or_b32_e32 v27, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 -; SI-NEXT: v_or_b32_e32 v32, v22, v4 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v49, v20, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 +; SI-NEXT: v_or_b32_e32 v26, v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v36, v20, v8 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v25, v13, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v28 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v28, v13, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s27 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v22, v22, v10 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v12 -; SI-NEXT: v_or_b32_e32 v20, v20, v14 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v48, v24, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_or_b32_e32 v50, v20, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v28, v24, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v27, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v27 -; SI-NEXT: v_or_b32_e32 v37, v26, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v35 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 -; SI-NEXT: v_lshr_b64 v[46:47], v[22:23], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; SI-NEXT: v_or_b32_e32 v21, v21, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 -; SI-NEXT: v_mov_b32_e32 v47, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_or_b32_e32 v15, v15, v26 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v44 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_or_b32_e32 v11, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v9, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_lshr_b64 v[52:53], v[6:7], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 -; SI-NEXT: v_or_b32_e32 v5, v5, v27 -; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v26 -; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v26 -; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v43, v50 -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v55, v48 -; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[10:11], 16 -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_mov_b32_e32 v51, v37 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_lshr_b64 v[38:39], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_lshr_b64 v[34:35], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v35, v31 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[26:27], v[24:25], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v61 +; SI-NEXT: v_or_b32_e32 v50, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_or_b32_e32 v39, v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_or_b32_e32 v31, v13, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v33, v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v63 +; SI-NEXT: v_lshr_b64 v[53:54], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v24, v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_lshr_b64 v[46:47], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_lshr_b64 v[34:35], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v19, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshr_b64 v[44:45], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[49:50], 16 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[2:3], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v59 +; SI-NEXT: v_lshr_b64 v[58:59], v[10:11], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[0:1], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, s12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s11 +; SI-NEXT: v_mov_b32_e32 v55, s10 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s8 +; SI-NEXT: v_mov_b32_e32 v56, s7 +; SI-NEXT: v_mov_b32_e32 v57, s6 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v24, s21 +; SI-NEXT: v_mov_b32_e32 v33, s23 +; SI-NEXT: v_mov_b32_e32 v31, s25 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v39, s27 +; SI-NEXT: v_mov_b32_e32 v50, s29 +; SI-NEXT: v_mov_b32_e32 v46, s43 +; SI-NEXT: v_mov_b32_e32 v34, s42 +; SI-NEXT: v_mov_b32_e32 v44, s41 +; SI-NEXT: v_mov_b32_e32 v42, s40 +; SI-NEXT: v_mov_b32_e32 v40, s15 +; SI-NEXT: v_mov_b32_e32 v53, s14 +; SI-NEXT: v_mov_b32_e32 v51, s13 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: v_or_b32_e32 v30, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: v_or_b32_e32 v34, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v52 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_or_b32_e32 v32, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v55 +; SI-NEXT: v_or_b32_e32 v31, v10, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v12, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v61 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v56 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v62 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -43224,19 +40318,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v30 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v5, v32 +; SI-NEXT: v_mov_b32_e32 v7, v29 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v11, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index b42188f0f3980..9f342f95cd8ee 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -6903,555 +6903,245 @@ end: define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v28i32_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8050,160 +7740,108 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 ; SI-NEXT: v_mov_b32_e32 v15, s16 ; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: v_mov_b32_e32 v17, s18 ; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_readfirstlane_b32 s41, v16 +; SI-NEXT: v_readfirstlane_b32 s45, v16 ; SI-NEXT: v_mov_b32_e32 v16, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v17 ; SI-NEXT: v_mov_b32_e32 v17, s23 ; SI-NEXT: v_readfirstlane_b32 s43, v18 ; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_readfirstlane_b32 s41, v15 ; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v16 +; SI-NEXT: v_readfirstlane_b32 s24, v16 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s22, v16 -; SI-NEXT: v_readfirstlane_b32 s21, v17 -; SI-NEXT: v_readfirstlane_b32 s20, v18 -; SI-NEXT: v_readfirstlane_b32 s19, v0 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s16, v3 -; SI-NEXT: v_readfirstlane_b32 s15, v4 -; SI-NEXT: v_readfirstlane_b32 s14, v5 -; SI-NEXT: v_readfirstlane_b32 s13, v6 -; SI-NEXT: v_readfirstlane_b32 s12, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v8 -; SI-NEXT: v_readfirstlane_b32 s10, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 ; SI-NEXT: v_readfirstlane_b32 s7, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -8218,281 +7856,201 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s44, s42, 16 -; SI-NEXT: s_lshr_b32 s45, s43, 16 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s26, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s28, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s23, 16 -; SI-NEXT: s_lshr_b32 s61, s22, 16 -; SI-NEXT: s_lshr_b32 s62, s21, 16 -; SI-NEXT: s_lshr_b32 s63, s20, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s18, 16 -; SI-NEXT: s_lshr_b32 s74, s17, 16 -; SI-NEXT: s_lshr_b32 s75, s16, 16 -; SI-NEXT: s_lshr_b32 s76, s15, 16 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s13, 16 -; SI-NEXT: s_lshr_b32 s79, s12, 16 -; SI-NEXT: s_lshr_b32 s88, s11, 16 -; SI-NEXT: s_lshr_b32 s89, s10, 16 -; SI-NEXT: s_lshr_b32 s90, s8, 16 -; SI-NEXT: s_lshr_b32 s91, s7, 16 -; SI-NEXT: s_lshr_b32 s92, s6, 16 -; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s25 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v3, v46, v3 -; SI-NEXT: v_or_b32_e32 v4, v43, v4 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; SI-NEXT: v_or_b32_e32 v17, v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v25, v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: v_or_b32_e32 v10, v53, v10 -; SI-NEXT: v_or_b32_e32 v12, v51, v12 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v16, v39, v16 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v22, v33, v22 -; SI-NEXT: v_or_b32_e32 v24, v31, v24 -; SI-NEXT: v_or_b32_e32 v26, v29, v26 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s46, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v28i32_to_v56f16_scalar: @@ -9258,205 +8816,264 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -9500,145 +9117,20 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -9651,10 +9143,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -9663,167 +9155,152 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -9831,78 +9308,92 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -10691,664 +10182,412 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v28i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB19_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: @@ -17987,555 +17226,245 @@ end: define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v28f32_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v38, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v48, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19107,21 +18036,21 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_mov_b32_e32 v21, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v28, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v33, s22 -; SI-NEXT: v_mov_b32_e32 v32, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v19, s26 -; SI-NEXT: v_mov_b32_e32 v18, s27 -; SI-NEXT: v_mov_b32_e32 v31, s28 -; SI-NEXT: v_mov_b32_e32 v23, s29 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19140,338 +18069,185 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v29 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_mov_b32_e32 v55, v28 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v34, v13 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v53 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v37 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v41 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v48 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19488,167 +18264,59 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v36 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; kill: killed $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: @@ -20500,205 +19168,264 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -20742,145 +19469,20 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -20893,10 +19495,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20905,167 +19507,152 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -21073,78 +19660,92 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -21933,664 +20534,412 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-LABEL: bitcast_v56f16_to_v28f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB35_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: @@ -28326,242 +26675,83 @@ end: define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v14i64_to_v56f16: ; SI: ; %bb.0: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -28574,7 +26764,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 @@ -28588,294 +26777,143 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_mov_b32_e32 v46, v24 -; SI-NEXT: v_mov_b32_e32 v45, v25 -; SI-NEXT: v_mov_b32_e32 v43, v26 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v47 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v42 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v58 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29488,449 +27526,317 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 ; SI-NEXT: v_mov_b32_e32 v15, s16 ; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: v_mov_b32_e32 v17, s18 ; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_writelane_b32 v28, s50, 10 ; SI-NEXT: v_mov_b32_e32 v19, s20 -; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 ; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_readfirstlane_b32 s42, v16 +; SI-NEXT: v_readfirstlane_b32 s45, v16 ; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_readfirstlane_b32 s41, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v17 ; SI-NEXT: v_mov_b32_e32 v17, s23 ; SI-NEXT: v_readfirstlane_b32 s43, v18 ; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v19 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_readfirstlane_b32 s40, v19 ; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s41, v15 ; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v16 +; SI-NEXT: v_readfirstlane_b32 s24, v16 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v15 -; SI-NEXT: v_readfirstlane_b32 s23, v16 -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v0 -; SI-NEXT: v_readfirstlane_b32 s19, v1 -; SI-NEXT: v_readfirstlane_b32 s16, v2 -; SI-NEXT: v_readfirstlane_b32 s17, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s7, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s19, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v0 +; SI-NEXT: v_readfirstlane_b32 s17, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_writelane_b32 v28, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s40 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s42, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s42, s43, 0 -; SI-NEXT: s_lshr_b32 s43, s41, 16 -; SI-NEXT: s_lshr_b32 s45, s42, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: s_lshr_b32 s47, s44, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s56, s25, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s58, s26, 16 -; SI-NEXT: s_lshr_b32 s59, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s60, s22, 16 -; SI-NEXT: s_lshr_b32 s61, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s62, s20, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s72, s18, 16 -; SI-NEXT: s_lshr_b32 s73, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s74, s16, 16 -; SI-NEXT: s_lshr_b32 s75, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s76, s14, 16 -; SI-NEXT: s_lshr_b32 s77, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s78, s12, 16 -; SI-NEXT: s_lshr_b32 s79, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s88, s10, 16 -; SI-NEXT: s_lshr_b32 s89, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s90, s7, 16 -; SI-NEXT: s_lshr_b32 s91, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 s92, s6, 16 -; SI-NEXT: s_lshr_b32 s93, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v43, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s17, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s21, 16 +; SI-NEXT: s_lshr_b32 s49, s23, 16 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s52, s43, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: v_or_b32_e32 v2, v44, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v3, v46, v3 -; SI-NEXT: v_or_b32_e32 v4, v43, v4 -; SI-NEXT: v_or_b32_e32 v6, v41, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 -; SI-NEXT: v_or_b32_e32 v11, v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 -; SI-NEXT: v_or_b32_e32 v15, v48, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; SI-NEXT: v_or_b32_e32 v17, v38, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v19, v36, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v25, v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: v_or_b32_e32 v10, v53, v10 -; SI-NEXT: v_or_b32_e32 v12, v51, v12 -; SI-NEXT: v_or_b32_e32 v14, v49, v14 -; SI-NEXT: v_or_b32_e32 v16, v39, v16 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_or_b32_e32 v20, v35, v20 -; SI-NEXT: v_or_b32_e32 v22, v33, v22 -; SI-NEXT: v_or_b32_e32 v24, v31, v24 -; SI-NEXT: v_or_b32_e32 v26, v29, v26 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: s_lshl_b32 s27, s92, 16 +; SI-NEXT: s_and_b32 s29, s44, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: s_lshl_b32 s44, s88, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s51, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: s_lshl_b32 s44, s78, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s44 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s25, s25, s44 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s44, s76, 16 +; SI-NEXT: s_or_b32 s22, s22, s44 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s44, s49, 16 +; SI-NEXT: s_or_b32 s23, s23, s44 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s44, s74, 16 +; SI-NEXT: s_or_b32 s20, s20, s44 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s21, s21, s44 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s44, s72, 16 +; SI-NEXT: s_or_b32 s18, s18, s44 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s44, s39, 16 +; SI-NEXT: s_or_b32 s19, s19, s44 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s62, 16 +; SI-NEXT: s_or_b32 s16, s16, s44 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s38, 16 +; SI-NEXT: s_or_b32 s17, s17, s44 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s60, 16 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s37, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s58, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s36, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s56, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s35, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s46, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s34, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s31, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s30, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s43 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s41 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s11 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_mov_b32_e32 v23, s9 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v25, s7 +; SI-NEXT: v_mov_b32_e32 v26, s4 +; SI-NEXT: v_mov_b32_e32 v27, s5 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v14i64_to_v56f16_scalar: @@ -30696,205 +28602,264 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -30938,145 +28903,20 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -31089,10 +28929,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -31101,167 +28941,152 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -31269,78 +29094,92 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -32129,664 +29968,412 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v14i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: @@ -37601,523 +35188,229 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v28, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v26 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_add_f64 v[53:54], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v28, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v29, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v31, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v39, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v50, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v0 -; SI-NEXT: v_mov_b32_e32 v42, v26 -; SI-NEXT: v_mov_b32_e32 v40, v27 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v46 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v63 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v40 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38694,411 +35987,166 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshr_b64 v[51:52], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[53:54], v[20:21], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: v_mov_b32_e32 v40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[52:53], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshr_b64 v[53:54], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[2:3], 16 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[20:21], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_mov_b32_e32 v29, v13 -; SI-NEXT: v_mov_b32_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v44, v11 -; SI-NEXT: v_mov_b32_e32 v42, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[28:29], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v28, v24, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v30, v20, v24 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v32, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v34, v21, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v20, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_or_b32_e32 v36, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: v_or_b32_e32 v37, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; SI-NEXT: v_or_b32_e32 v38, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; SI-NEXT: v_or_b32_e32 v39, v16, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v48, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v49, v14, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_or_b32_e32 v14, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v60 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v63 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v46 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -39115,90 +36163,61 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v2, v30 +; SI-NEXT: v_mov_b32_e32 v3, v31 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v7, v35 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v9, v37 +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v13, v49 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: @@ -40020,205 +37039,264 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v48, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_mov_b32_e32 v50, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_mov_b32_e32 v51, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v44, v7 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_mov_b32_e32 v47, v4 +; SI-NEXT: v_mov_b32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v57, v2 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v59 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -40262,145 +37340,20 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_or_b32_e32 v4, v54, v4 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v21, v60, v21 -; SI-NEXT: v_or_b32_e32 v22, v58, v22 -; SI-NEXT: v_or_b32_e32 v23, v48, v23 -; SI-NEXT: v_or_b32_e32 v24, v38, v24 -; SI-NEXT: v_or_b32_e32 v25, v36, v25 -; SI-NEXT: v_or_b32_e32 v26, v34, v26 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_or_b32_e32 v20, v62, v20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -40413,10 +37366,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -40425,167 +37378,152 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -40593,78 +37531,92 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 @@ -41453,664 +38405,412 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-LABEL: bitcast_v56f16_to_v14f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s17 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s19 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s21 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s8 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v32, v13 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v1 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_or_b32_e32 v5, v63, v5 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_or_b32_e32 v6, v62, v6 -; SI-NEXT: v_or_b32_e32 v7, v59, v7 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_or_b32_e32 v8, v47, v8 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_or_b32_e32 v12, v45, v12 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: v_or_b32_e32 v15, v31, v15 -; SI-NEXT: v_or_b32_e32 v16, v28, v16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v33, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v57, v27 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v44 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v54 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v33 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 ; SI-NEXT: .LBB55_3: ; %end -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v43 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v41 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v42, v30 -; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v28, v41 -; SI-NEXT: v_mov_b32_e32 v30, v42 -; SI-NEXT: v_mov_b32_e32 v41, v45 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v32, v48 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v48, v63 -; SI-NEXT: v_mov_b32_e32 v63, v62 -; SI-NEXT: v_mov_b32_e32 v61, v60 -; SI-NEXT: v_mov_b32_e32 v59, v47 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v57 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: @@ -42799,7 +39499,67 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v56f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v13 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -42851,743 +39611,681 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 -; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v55 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v60, v1, v28 +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v58, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v58, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v57, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_mov_b32_e32 v39, v28 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_alignbit_b32 v1, v57, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v47, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v1, v47, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v45, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v44, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v42, v1, v3 +; SI-NEXT: v_alignbit_b32 v1, v42, v59, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v40, v1, v35 +; SI-NEXT: v_alignbit_b32 v1, v40, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v55, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v55, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v53, v1, v34 +; SI-NEXT: v_alignbit_b32 v1, v53, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v52, v1, v41 +; SI-NEXT: v_alignbit_b32 v1, v52, v31, 16 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v50, v1, v37 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v50, v32, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v48, v1, v33 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v48, v63, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v3, v1, v38 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_alignbit_b32 v1, v3, v61, 16 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB56_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x30000, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v24, v33, v24 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v20, v31, v20 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v18, v30, v18 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v34, v18 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v36, v16 +; SI-NEXT: v_or_b32_e32 v14, v62, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v60, v2, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v58, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v6, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v47, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v45, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v40, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v53, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v52, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v50, v24, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v48, v26, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v39, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v55 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v51 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v53 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v56f16: @@ -44152,577 +40850,629 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-LABEL: bitcast_v56i16_to_v56f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v28, s30, 0 +; SI-NEXT: v_writelane_b32 v28, s31, 1 +; SI-NEXT: v_writelane_b32 v28, s34, 2 +; SI-NEXT: v_writelane_b32 v28, s35, 3 +; SI-NEXT: v_writelane_b32 v28, s36, 4 +; SI-NEXT: v_writelane_b32 v28, s37, 5 +; SI-NEXT: v_writelane_b32 v28, s38, 6 +; SI-NEXT: v_writelane_b32 v28, s39, 7 +; SI-NEXT: v_writelane_b32 v28, s48, 8 +; SI-NEXT: v_writelane_b32 v28, s49, 9 +; SI-NEXT: v_writelane_b32 v28, s50, 10 +; SI-NEXT: v_writelane_b32 v28, s51, 11 +; SI-NEXT: v_writelane_b32 v28, s52, 12 +; SI-NEXT: v_writelane_b32 v28, s53, 13 +; SI-NEXT: v_writelane_b32 v28, s54, 14 +; SI-NEXT: v_writelane_b32 v28, s55, 15 +; SI-NEXT: v_writelane_b32 v28, s64, 16 +; SI-NEXT: v_writelane_b32 v28, s65, 17 +; SI-NEXT: v_writelane_b32 v28, s66, 18 +; SI-NEXT: v_writelane_b32 v28, s67, 19 +; SI-NEXT: v_writelane_b32 v28, s68, 20 +; SI-NEXT: v_writelane_b32 v28, s69, 21 +; SI-NEXT: v_writelane_b32 v28, s70, 22 +; SI-NEXT: v_writelane_b32 v28, s71, 23 +; SI-NEXT: v_writelane_b32 v28, s80, 24 +; SI-NEXT: v_writelane_b32 v28, s81, 25 +; SI-NEXT: v_writelane_b32 v28, s82, 26 +; SI-NEXT: v_writelane_b32 v28, s83, 27 +; SI-NEXT: v_writelane_b32 v28, s84, 28 +; SI-NEXT: v_writelane_b32 v28, s85, 29 +; SI-NEXT: v_writelane_b32 v28, s86, 30 +; SI-NEXT: v_writelane_b32 v28, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v28, s96, 32 +; SI-NEXT: s_lshr_b32 s66, s29, 16 +; SI-NEXT: s_lshr_b32 s93, s28, 16 +; SI-NEXT: s_lshr_b32 s65, s27, 16 +; SI-NEXT: s_lshr_b32 s91, s26, 16 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s89, s24, 16 +; SI-NEXT: s_lshr_b32 s55, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s22, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s77, s20, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 16 +; SI-NEXT: s_lshr_b32 s75, s18, 16 +; SI-NEXT: s_lshr_b32 s83, s17, 16 +; SI-NEXT: s_lshr_b32 s73, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v29, s17, 0 +; SI-NEXT: v_writelane_b32 v28, s97, 33 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s50, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_writelane_b32 v29, s16, 1 +; SI-NEXT: v_writelane_b32 v28, s98, 34 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: v_writelane_b32 v29, s19, 2 +; SI-NEXT: v_writelane_b32 v28, s99, 35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_writelane_b32 v29, s5, 3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s84, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s98, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s86, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_readfirstlane_b32 s39, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s51, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_writelane_b32 v29, s7, 4 +; SI-NEXT: v_readfirstlane_b32 s87, v4 +; SI-NEXT: v_readfirstlane_b32 s37, v3 +; SI-NEXT: v_readfirstlane_b32 s49, v2 +; SI-NEXT: v_readfirstlane_b32 s95, v1 +; SI-NEXT: v_readfirstlane_b32 s35, v0 +; SI-NEXT: v_readfirstlane_b32 s81, v15 +; SI-NEXT: v_readfirstlane_b32 s82, v16 +; SI-NEXT: v_readfirstlane_b32 s80, v17 +; SI-NEXT: v_readfirstlane_b32 s10, v18 +; SI-NEXT: v_readfirstlane_b32 s71, v19 +; SI-NEXT: v_readfirstlane_b32 s70, v12 +; SI-NEXT: v_readfirstlane_b32 s69, v10 +; SI-NEXT: v_readfirstlane_b32 s68, v8 +; SI-NEXT: v_readfirstlane_b32 s96, v7 +; SI-NEXT: v_readfirstlane_b32 s67, v6 +; SI-NEXT: v_readfirstlane_b32 s53, v5 +; SI-NEXT: v_writelane_b32 v29, s9, 5 +; SI-NEXT: v_writelane_b32 v29, s10, 6 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: v_mov_b32_e32 v28, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v29, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v30, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_mov_b32_e32 v33, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v19, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v23, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_mov_b32_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s7, 16 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s83, 16 +; SI-NEXT: s_lshl_b32 s72, s73, 16 +; SI-NEXT: s_mov_b32 s76, s73 +; SI-NEXT: s_or_b32 s73, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s85, 16 +; SI-NEXT: s_lshl_b32 s74, s75, 16 +; SI-NEXT: s_lshl_b32 s62, s77, 16 +; SI-NEXT: s_mov_b32 s78, s77 +; SI-NEXT: s_mov_b32 s77, s75 +; SI-NEXT: s_or_b32 s75, s5, s7 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s7, s54, 16 +; SI-NEXT: s_or_b32 s63, s5, s7 +; SI-NEXT: s_and_b32 s5, s23, 0xffff +; SI-NEXT: s_lshl_b32 s7, s55, 16 +; SI-NEXT: s_or_b32 s61, s5, s7 +; SI-NEXT: s_and_b32 s5, s25, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s59, s5, s7 +; SI-NEXT: s_and_b32 s5, s27, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s57, s5, s7 +; SI-NEXT: s_and_b32 s5, s29, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s95, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s7, s69, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_and_b32 s5, s39, 0xffff +; SI-NEXT: s_lshl_b32 s7, s70, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: s_and_b32 s5, s50, 0xffff +; SI-NEXT: s_lshl_b32 s7, s71, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_and_b32 s5, s97, 0xffff +; SI-NEXT: s_lshl_b32 s7, s80, 16 +; SI-NEXT: s_or_b32 s8, s4, s72 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s99, 0xffff +; SI-NEXT: s_lshl_b32 s7, s81, 16 +; SI-NEXT: s_or_b32 s6, s4, s74 +; SI-NEXT: s_lshl_b32 s12, s9, 16 +; SI-NEXT: s_or_b32 vcc_hi, s5, s7 +; SI-NEXT: s_mov_b32 s9, s73 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_mov_b32 s7, s75 +; SI-NEXT: s_lshr_b64 s[74:75], s[74:75], 16 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s60, s79, 16 +; SI-NEXT: s_mov_b32 s73, s76 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_lshr_b64 s[76:77], s[62:63], 16 +; SI-NEXT: s_or_b32 s4, s4, s62 +; SI-NEXT: s_lshl_b32 s58, s89, 16 +; SI-NEXT: s_mov_b32 s77, s78 +; SI-NEXT: s_and_b32 s62, s22, 0xffff +; SI-NEXT: s_mov_b32 s88, s79 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshl_b32 s56, s91, 16 +; SI-NEXT: s_or_b32 s62, s62, s60 +; SI-NEXT: s_mov_b32 s79, s88 +; SI-NEXT: s_and_b32 s60, s24, 0xffff +; SI-NEXT: s_mov_b32 s90, s89 +; SI-NEXT: s_lshr_b64 s[88:89], s[58:59], 16 +; SI-NEXT: s_lshl_b32 s46, s93, 16 +; SI-NEXT: s_or_b32 s60, s60, s58 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: s_and_b32 s58, s26, 0xffff +; SI-NEXT: s_mov_b32 s92, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[56:57], 16 +; SI-NEXT: s_lshl_b32 s44, s53, 16 +; SI-NEXT: s_or_b32 s58, s58, s56 +; SI-NEXT: s_mov_b32 s91, s92 +; SI-NEXT: s_and_b32 s56, s28, 0xffff +; SI-NEXT: s_mov_b32 s94, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 16 +; SI-NEXT: s_lshl_b32 s42, s96, 16 +; SI-NEXT: s_or_b32 s56, s56, s46 +; SI-NEXT: s_mov_b32 s93, s94 +; SI-NEXT: s_and_b32 s46, s35, 0xffff +; SI-NEXT: s_mov_b32 s30, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_or_b32 s46, s46, s44 +; SI-NEXT: s_mov_b32 s95, s30 +; SI-NEXT: s_and_b32 s44, s49, 0xffff +; SI-NEXT: s_mov_b32 s34, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[42:43], 16 +; SI-NEXT: s_or_b32 s44, s44, s42 +; SI-NEXT: s_mov_b32 s31, s34 +; SI-NEXT: s_and_b32 s42, s87, 0xffff +; SI-NEXT: s_mov_b32 s36, s35 +; SI-NEXT: s_lshr_b64 s[34:35], s[40:41], 16 +; SI-NEXT: s_or_b32 s42, s42, s40 +; SI-NEXT: s_mov_b32 s35, s36 +; SI-NEXT: s_and_b32 s40, s51, 0xffff +; SI-NEXT: s_mov_b32 s38, s37 +; SI-NEXT: s_lshr_b64 s[36:37], s[14:15], 16 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s40, s40, s14 +; SI-NEXT: s_mov_b32 s37, s38 +; SI-NEXT: s_and_b32 s14, s86, 0xffff +; SI-NEXT: s_mov_b32 s48, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 16 +; SI-NEXT: s_lshl_b32 vcc_lo, s82, 16 +; SI-NEXT: s_or_b32 s14, s14, s12 +; SI-NEXT: s_mov_b32 s39, s48 +; SI-NEXT: s_and_b32 s12, s98, 0xffff +; SI-NEXT: s_mov_b32 s16, s82 +; SI-NEXT: s_mov_b32 s82, s99 +; SI-NEXT: s_mov_b32 s99, s98 +; SI-NEXT: s_mov_b32 s98, s96 +; SI-NEXT: s_mov_b32 s96, s86 +; SI-NEXT: s_mov_b32 s86, s50 +; SI-NEXT: s_mov_b32 s50, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[10:11], 16 +; SI-NEXT: s_or_b32 s12, s12, s10 +; SI-NEXT: s_mov_b32 s49, s50 +; SI-NEXT: s_mov_b32 s50, s86 +; SI-NEXT: s_mov_b32 s86, s96 +; SI-NEXT: s_and_b32 s10, s84, 0xffff +; SI-NEXT: s_mov_b32 s96, s53 +; SI-NEXT: s_lshr_b64 s[52:53], vcc, 16 +; SI-NEXT: s_mov_b32 s5, s63 +; SI-NEXT: s_mov_b32 s63, s61 +; SI-NEXT: s_mov_b32 s61, s59 +; SI-NEXT: s_mov_b32 s59, s57 +; SI-NEXT: s_mov_b32 s57, s47 +; SI-NEXT: s_mov_b32 s47, s45 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_mov_b32 s43, s41 +; SI-NEXT: s_mov_b32 s41, s15 +; SI-NEXT: s_mov_b32 s15, s13 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_or_b32 s10, s10, vcc_lo +; SI-NEXT: s_mov_b32 s11, vcc_hi +; SI-NEXT: s_mov_b32 s53, s96 +; SI-NEXT: s_mov_b32 s96, s98 +; SI-NEXT: s_mov_b32 s98, s99 +; SI-NEXT: s_mov_b32 s99, s82 +; SI-NEXT: s_mov_b32 s82, s16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s22 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_and_b32 s4, s84, 0xffff +; SI-NEXT: s_lshl_b32 s5, s82, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s99, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s99, s98, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 6 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s99, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s97, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s86, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s50, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s71, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s51, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s39, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s87, 3 +; SI-NEXT: v_readlane_b32 s5, v29, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s31, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s49, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s37, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s35, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_add_i32 s4, s95, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s28 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s21 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v29 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v28 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v48, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s58, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s59, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s60, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s61, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s62, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s55, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s63, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s21, 0xffff +; SI-NEXT: s_lshl_b32 s6, s54, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s75, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v29, 2 +; SI-NEXT: s_add_i32 s19, s7, 3 +; SI-NEXT: s_and_b32 s7, s19, 0xffff +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v29, 1 +; SI-NEXT: s_add_i32 s16, s8, 3 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_lshl_b32 s9, s73, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v29, 0 +; SI-NEXT: s_add_i32 s17, s9, 3 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s83, 16 +; SI-NEXT: s_or_b32 s9, s16, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_lshr_b64 s[72:73], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b32 s83, s9, 16 +; SI-NEXT: s_lshr_b32 s85, s7, 16 +; SI-NEXT: s_lshr_b32 s54, s5, 16 +; SI-NEXT: s_lshr_b32 s55, s63, 16 +; SI-NEXT: s_lshr_b32 s64, s61, 16 +; SI-NEXT: s_lshr_b32 s65, s59, 16 +; SI-NEXT: s_lshr_b32 s66, s57, 16 +; SI-NEXT: s_lshr_b32 s67, s47, 16 +; SI-NEXT: s_lshr_b32 s68, s45, 16 +; SI-NEXT: s_lshr_b32 s69, s43, 16 +; SI-NEXT: s_lshr_b32 s70, s41, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 16 +; SI-NEXT: s_lshr_b32 s80, s13, 16 +; SI-NEXT: s_lshr_b32 s81, s11, 16 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v49 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v51 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v46 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v57 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v61 -; SI-NEXT: v_or_b32_e32 v19, v23, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 -; SI-NEXT: v_or_b32_e32 v20, v23, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v63 -; SI-NEXT: v_or_b32_e32 v21, v29, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_or_b32_e32 v22, v28, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 -; SI-NEXT: v_or_b32_e32 v23, v30, v23 -; SI-NEXT: v_or_b32_e32 v24, v28, v24 -; SI-NEXT: v_or_b32_e32 v25, v29, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: s_or_b32 s8, s8, s16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s16, s83, 16 +; SI-NEXT: s_or_b32 s9, s9, s16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s74, 16 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s16, s85, 16 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s16, s54, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_and_b32 s16, s62, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s63, 0xffff +; SI-NEXT: s_lshl_b32 s18, s55, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s60, 0xffff +; SI-NEXT: s_lshl_b32 s19, s88, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s61, 0xffff +; SI-NEXT: s_lshl_b32 s20, s64, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s58, 0xffff +; SI-NEXT: s_lshl_b32 s21, s90, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s59, 0xffff +; SI-NEXT: s_lshl_b32 s22, s65, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s56, 0xffff +; SI-NEXT: s_lshl_b32 s23, s92, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s57, 0xffff +; SI-NEXT: s_lshl_b32 s24, s66, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s46, 0xffff +; SI-NEXT: s_lshl_b32 s25, s94, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s47, 0xffff +; SI-NEXT: s_lshl_b32 s26, s67, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s44, 0xffff +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s28, s68, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s34, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s43, 0xffff +; SI-NEXT: s_lshl_b32 s42, s69, 16 +; SI-NEXT: s_or_b32 s29, s29, s42 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s42, s36, 16 +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s70, 16 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s42, s38, 16 +; SI-NEXT: s_or_b32 s14, s14, s42 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s42, s71, 16 +; SI-NEXT: s_or_b32 s15, s15, s42 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s42, s48, 16 +; SI-NEXT: s_or_b32 s12, s12, s42 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s42, s80, 16 +; SI-NEXT: s_or_b32 s13, s13, s42 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s42, s52, 16 +; SI-NEXT: s_or_b32 s10, s10, s42 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s42, s81, 16 +; SI-NEXT: s_or_b32 s11, s11, s42 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_mov_b32_e32 v20, s40 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v22, s14 +; SI-NEXT: v_mov_b32_e32 v23, s15 +; SI-NEXT: v_mov_b32_e32 v24, s12 +; SI-NEXT: v_mov_b32_e32 v25, s13 +; SI-NEXT: v_mov_b32_e32 v26, s10 +; SI-NEXT: v_mov_b32_e32 v27, s11 +; SI-NEXT: v_readlane_b32 s99, v28, 35 +; SI-NEXT: v_readlane_b32 s98, v28, 34 +; SI-NEXT: v_readlane_b32 s97, v28, 33 +; SI-NEXT: v_readlane_b32 s96, v28, 32 +; SI-NEXT: v_readlane_b32 s87, v28, 31 +; SI-NEXT: v_readlane_b32 s86, v28, 30 +; SI-NEXT: v_readlane_b32 s85, v28, 29 +; SI-NEXT: v_readlane_b32 s84, v28, 28 +; SI-NEXT: v_readlane_b32 s83, v28, 27 +; SI-NEXT: v_readlane_b32 s82, v28, 26 +; SI-NEXT: v_readlane_b32 s81, v28, 25 +; SI-NEXT: v_readlane_b32 s80, v28, 24 +; SI-NEXT: v_readlane_b32 s71, v28, 23 +; SI-NEXT: v_readlane_b32 s70, v28, 22 +; SI-NEXT: v_readlane_b32 s69, v28, 21 +; SI-NEXT: v_readlane_b32 s68, v28, 20 +; SI-NEXT: v_readlane_b32 s67, v28, 19 +; SI-NEXT: v_readlane_b32 s66, v28, 18 +; SI-NEXT: v_readlane_b32 s65, v28, 17 +; SI-NEXT: v_readlane_b32 s64, v28, 16 +; SI-NEXT: v_readlane_b32 s55, v28, 15 +; SI-NEXT: v_readlane_b32 s54, v28, 14 +; SI-NEXT: v_readlane_b32 s53, v28, 13 +; SI-NEXT: v_readlane_b32 s52, v28, 12 +; SI-NEXT: v_readlane_b32 s51, v28, 11 +; SI-NEXT: v_readlane_b32 s50, v28, 10 +; SI-NEXT: v_readlane_b32 s49, v28, 9 +; SI-NEXT: v_readlane_b32 s48, v28, 8 +; SI-NEXT: v_readlane_b32 s39, v28, 7 +; SI-NEXT: v_readlane_b32 s38, v28, 6 +; SI-NEXT: v_readlane_b32 s37, v28, 5 +; SI-NEXT: v_readlane_b32 s36, v28, 4 +; SI-NEXT: v_readlane_b32 s35, v28, 3 +; SI-NEXT: v_readlane_b32 s34, v28, 2 +; SI-NEXT: v_readlane_b32 s31, v28, 1 +; SI-NEXT: v_readlane_b32 s30, v28, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: v_mov_b32_e32 v34, v43 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v33, v41 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v30, v19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v29, v18 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v28, v17 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v19, v16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v18, v27 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v17, v26 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v23, v25 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: v_mov_b32_e32 v16, v24 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; kill: killed $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v56i16_to_v56f16_scalar: @@ -45611,10 +42361,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v56f16_to_v56i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -45628,145 +42374,37 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -45811,258 +42449,260 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v27, v27, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v29, v29, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v30, v30, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v23, v23, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v24, v24, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v31, v31, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v33, v33, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v34, v34, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v36, v36, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_or_b32_e32 v12, v12, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_or_b32_e32 v37, v37, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v9, v9, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v39, v39, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_or_b32_e32 v50, v50, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v56 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 ; SI-NEXT: v_or_b32_e32 v4, v4, v45 -; SI-NEXT: v_or_b32_e32 v48, v48, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 ; SI-NEXT: v_or_b32_e32 v8, v8, v43 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 -; SI-NEXT: v_or_b32_e32 v38, v38, v58 +; SI-NEXT: v_or_b32_e32 v12, v12, v58 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 ; SI-NEXT: v_or_b32_e32 v16, v16, v55 -; SI-NEXT: v_or_b32_e32 v35, v35, v59 +; SI-NEXT: v_or_b32_e32 v18, v18, v59 ; SI-NEXT: v_or_b32_e32 v20, v20, v53 ; SI-NEXT: v_or_b32_e32 v22, v22, v52 -; SI-NEXT: v_or_b32_e32 v32, v32, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v60 ; SI-NEXT: v_or_b32_e32 v26, v26, v51 -; SI-NEXT: v_alignbit_b32 v56, v2, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v50, v46, 16 -; SI-NEXT: v_alignbit_b32 v46, v6, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v39, v57, 16 -; SI-NEXT: v_alignbit_b32 v44, v37, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v12, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v36, v58, 16 -; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v18, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v33, v59, 16 -; SI-NEXT: v_alignbit_b32 v54, v31, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v30, v60, 16 -; SI-NEXT: v_alignbit_b32 v51, v29, v51, 16 +; SI-NEXT: v_alignbit_b32 v56, v1, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v3, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v7, v57, 16 +; SI-NEXT: v_alignbit_b32 v44, v9, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v13, v58, 16 +; SI-NEXT: v_alignbit_b32 v41, v15, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v19, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v23, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v25, v60, 16 +; SI-NEXT: v_alignbit_b32 v51, v27, v51, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 ; SI-NEXT: v_or_b32_e32 v2, v2, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v49 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v28 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 ; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v28 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 ; SI-NEXT: v_or_b32_e32 v8, v8, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v28, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v28 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 ; SI-NEXT: v_or_b32_e32 v12, v12, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v28, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v28 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v41 ; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v28, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload @@ -46079,40 +42719,40 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v28 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 ; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v28, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v28 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 ; SI-NEXT: v_or_b32_e32 v20, v20, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_or_b32_e32 v21, v21, v28 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v28, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 ; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -46679,15 +43319,28 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-LABEL: bitcast_v56f16_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v25, v10 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s28, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s24, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -46704,521 +43357,412 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: s_cbranch_scc0 .LBB59_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_4 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s18 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 -; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v24 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 -; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s15 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v52, v0, v37 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v43 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 -; SI-NEXT: v_or_b32_e32 v34, v26, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v20, v6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v41, v4, v53 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_or_b32_e32 v19, v8, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v18, v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v31, v10, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 +; SI-NEXT: v_or_b32_e32 v30, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v26, v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v23, v17, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v26, v29, v2 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v29, v4 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v17 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v46 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v10 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v26, v26, v6 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v30, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v61 +; SI-NEXT: v_or_b32_e32 v54, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s10 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; SI-NEXT: v_or_b32_e32 v38, v21, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s9 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; SI-NEXT: v_or_b32_e32 v36, v15, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v59 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v28, v21, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v60 +; SI-NEXT: v_lshr_b64 v[45:46], v[27:28], 16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v34, v17, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v17, v17, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_lshr_b64 v[57:58], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[55:56], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v41 +; SI-NEXT: v_lshr_b64 v[41:42], v[37:38], 16 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_lshr_b64 v[47:48], v[33:34], 16 +; SI-NEXT: v_mov_b32_e32 v46, v50 +; SI-NEXT: v_lshr_b64 v[43:44], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_lshr_b64 v[39:40], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v40, v25 +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_lshr_b64 v[48:49], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[12:13], 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v14, v52 +; SI-NEXT: v_lshr_b64 v[52:53], v[0:1], 16 +; SI-NEXT: s_branch .LBB59_5 +; SI-NEXT: .LBB59_3: +; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, s12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, s10 +; SI-NEXT: v_mov_b32_e32 v59, s9 +; SI-NEXT: v_mov_b32_e32 v60, s8 +; SI-NEXT: v_mov_b32_e32 v32, s7 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v34, s21 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v36, s25 +; SI-NEXT: v_mov_b32_e32 v38, s27 +; SI-NEXT: v_mov_b32_e32 v54, s29 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v57, s43 +; SI-NEXT: v_mov_b32_e32 v55, s42 +; SI-NEXT: v_mov_b32_e32 v47, s41 +; SI-NEXT: v_mov_b32_e32 v45, s40 +; SI-NEXT: v_mov_b32_e32 v43, s15 +; SI-NEXT: v_mov_b32_e32 v41, s14 +; SI-NEXT: v_mov_b32_e32 v39, s13 +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v16 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v26, v26, v12 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v30, v14 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v48, v26, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v55 -; SI-NEXT: v_or_b32_e32 v36, v30, v20 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v38, v26, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v28, v24 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v31, v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; SI-NEXT: v_or_b32_e32 v25, v25, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v56 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v49 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 -; SI-NEXT: v_or_b32_e32 v21, v21, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_or_b32_e32 v19, v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v62 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v37 -; SI-NEXT: v_or_b32_e32 v17, v17, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 -; SI-NEXT: v_or_b32_e32 v15, v15, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_or_b32_e32 v11, v11, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v28 -; SI-NEXT: v_lshr_b64 v[52:53], v[10:11], 16 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 -; SI-NEXT: v_or_b32_e32 v9, v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v7, v7, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 -; SI-NEXT: v_lshr_b64 v[40:41], v[6:7], 16 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v60 -; SI-NEXT: v_or_b32_e32 v5, v5, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 -; SI-NEXT: v_lshr_b64 v[42:43], v[4:5], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[8:9], 16 -; SI-NEXT: v_mov_b32_e32 v41, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_or_b32_e32 v3, v3, v28 -; SI-NEXT: v_lshr_b64 v[44:45], v[2:3], 16 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v28 -; SI-NEXT: v_lshr_b64 v[28:29], v[26:27], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v45, v37 -; SI-NEXT: v_mov_b32_e32 v43, v39 -; SI-NEXT: v_mov_b32_e32 v55, v38 -; SI-NEXT: v_mov_b32_e32 v53, v36 -; SI-NEXT: v_lshr_b64 v[50:51], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[14:15], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v51, v35 -; SI-NEXT: v_mov_b32_e32 v49, v33 -; SI-NEXT: v_mov_b32_e32 v39, v34 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v35, v31 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_lshr_b64 v[30:31], v[24:25], 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v33, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_or_b32_e32 v37, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v60 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v60 +; SI-NEXT: v_or_b32_e32 v35, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_or_b32_e32 v28, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v34, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v62 +; SI-NEXT: v_or_b32_e32 v36, v12, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v63 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v62 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v61 +; SI-NEXT: v_or_b32_e32 v38, v14, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v52 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_or_b32_e32 v15, v1, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -47235,24 +43779,17 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v51 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v3, v37 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v7, v28 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v11, v36 +; SI-NEXT: v_mov_b32_e32 v13, v38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v56f16_to_v56i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index c4d17c79d773e..4fe874215b3f8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -7374,617 +7374,265 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB16_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8633,174 +8281,117 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 ; SI-NEXT: v_mov_b32_e32 v17, s16 ; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v17 ; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s43, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v17 ; SI-NEXT: v_mov_b32_e32 v17, s22 -; SI-NEXT: v_readfirstlane_b32 s44, v18 +; SI-NEXT: v_readfirstlane_b32 s42, v18 ; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s25, v18 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s26, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s28, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_readfirstlane_b32 s29, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v17 -; SI-NEXT: v_readfirstlane_b32 s22, v18 -; SI-NEXT: v_readfirstlane_b32 s21, v0 -; SI-NEXT: v_readfirstlane_b32 s20, v1 -; SI-NEXT: v_readfirstlane_b32 s19, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_readfirstlane_b32 s17, v4 -; SI-NEXT: v_readfirstlane_b32 s16, v5 -; SI-NEXT: v_readfirstlane_b32 s15, v6 -; SI-NEXT: v_readfirstlane_b32 s14, v7 -; SI-NEXT: v_readfirstlane_b32 s13, v8 -; SI-NEXT: v_readfirstlane_b32 s12, v9 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 ; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v14 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -8815,303 +8406,216 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_lshr_b32 s5, s41, 16 -; SI-NEXT: s_lshr_b32 s46, s42, 16 -; SI-NEXT: s_lshr_b32 s47, s43, 16 -; SI-NEXT: s_lshr_b32 s56, s44, 16 -; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s25, 16 -; SI-NEXT: s_lshr_b32 s60, s26, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_lshr_b32 s62, s28, 16 -; SI-NEXT: s_lshr_b32 s63, s29, 16 -; SI-NEXT: s_lshr_b32 s72, s23, 16 -; SI-NEXT: s_lshr_b32 s73, s22, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s20, 16 -; SI-NEXT: s_lshr_b32 s76, s19, 16 -; SI-NEXT: s_lshr_b32 s77, s18, 16 -; SI-NEXT: s_lshr_b32 s78, s17, 16 -; SI-NEXT: s_lshr_b32 s79, s16, 16 -; SI-NEXT: s_lshr_b32 s88, s15, 16 -; SI-NEXT: s_lshr_b32 s89, s14, 16 -; SI-NEXT: s_lshr_b32 s90, s13, 16 -; SI-NEXT: s_lshr_b32 s91, s12, 16 -; SI-NEXT: s_lshr_b32 s92, s11, 16 -; SI-NEXT: s_lshr_b32 s93, s10, 16 -; SI-NEXT: s_lshr_b32 s94, s8, 16 -; SI-NEXT: s_lshr_b32 s95, s7, 16 -; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s41 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB17_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v13, v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_or_b32_e32 v17, v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v38, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v12, v55, v12 -; SI-NEXT: v_or_b32_e32 v14, v53, v14 -; SI-NEXT: v_or_b32_e32 v16, v51, v16 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v39, v20 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s56, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: @@ -9945,236 +9449,284 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -10226,157 +9778,20 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -10389,207 +9804,182 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -10597,74 +9987,95 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -11511,84 +10922,37 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v30i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -11605,576 +10969,373 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -12200,27 +11361,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v60f16_to_v30i32_scalar: @@ -19324,617 +18465,265 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB32_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v49, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v52, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v51 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20554,21 +19343,21 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v31, s18 -; SI-NEXT: v_mov_b32_e32 v48, s19 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v29, s19 +; SI-NEXT: v_mov_b32_e32 v26, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v49, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v37, s23 -; SI-NEXT: v_mov_b32_e32 v36, s24 -; SI-NEXT: v_mov_b32_e32 v35, s25 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_mov_b32_e32 v34, s27 ; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_mov_b32_e32 v33, s29 +; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -20587,475 +19376,206 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v39 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v46, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[30:31], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; SI-NEXT: v_lshr_b64 v[30:31], v[0:1], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v44 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v62 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v59 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v57 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21072,102 +19592,59 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v15, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; kill: killed $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: @@ -22087,316 +20564,156 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -22404,121 +20721,192 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: .LBB34_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22531,207 +20919,182 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -22739,74 +21102,95 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -23653,84 +22037,37 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-LABEL: bitcast_v60f16_to_v30f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23747,576 +22084,373 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -24342,27 +22476,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v60f16_to_v30f32_scalar: @@ -30528,265 +28642,89 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v25 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -30795,7 +28733,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 @@ -30814,330 +28751,156 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_mov_b32_e32 v59, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_mov_b32_e32 v34, v27 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v63 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v59 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v57 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v53 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31802,485 +29565,341 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 ; SI-NEXT: v_mov_b32_e32 v17, s16 ; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_writelane_b32 v30, s51, 11 ; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_readfirstlane_b32 s46, v17 ; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_readfirstlane_b32 s43, v18 +; SI-NEXT: v_readfirstlane_b32 s47, v18 ; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_readfirstlane_b32 s41, v19 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s44, v19 ; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_readfirstlane_b32 s44, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v17 ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_readfirstlane_b32 s42, v18 ; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_readfirstlane_b32 s45, v19 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_readfirstlane_b32 s24, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_readfirstlane_b32 s46, v18 +; SI-NEXT: v_readfirstlane_b32 s41, v18 ; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_readfirstlane_b32 s25, v19 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_readfirstlane_b32 s24, v19 ; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s25, v17 ; SI-NEXT: v_mov_b32_e32 v17, s28 -; SI-NEXT: v_readfirstlane_b32 s26, v18 +; SI-NEXT: v_readfirstlane_b32 s22, v18 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_readfirstlane_b32 s28, v19 -; SI-NEXT: v_readfirstlane_b32 s22, v17 -; SI-NEXT: v_readfirstlane_b32 s23, v18 -; SI-NEXT: v_readfirstlane_b32 s20, v0 -; SI-NEXT: v_readfirstlane_b32 s21, v1 -; SI-NEXT: v_readfirstlane_b32 s18, v2 -; SI-NEXT: v_readfirstlane_b32 s19, v3 -; SI-NEXT: v_readfirstlane_b32 s16, v4 -; SI-NEXT: v_readfirstlane_b32 s17, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s7, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v14 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s23, v19 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v0 +; SI-NEXT: v_readfirstlane_b32 s19, v1 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_readfirstlane_b32 s17, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_and_b64 s[26:27], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_lshr_b32 s4, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_lshr_b32 s4, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_lshr_b32 s4, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_lshr_b32 s4, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_lshr_b32 s4, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_lshr_b32 s4, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_lshr_b32 s4, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_lshr_b32 s4, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_lshr_b32 s4, s46, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: s_lshr_b32 s4, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_lshr_b32 s4, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s40 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: s_add_u32 s4, s40, 3 -; SI-NEXT: s_addc_u32 s5, s43, 0 -; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s43, s44, 0 -; SI-NEXT: s_lshr_b32 s44, s41, 16 -; SI-NEXT: s_lshr_b32 s47, s43, 16 -; SI-NEXT: s_add_u32 s42, s42, 3 -; SI-NEXT: s_addc_u32 s45, s45, 0 -; SI-NEXT: s_lshr_b32 s56, s42, 16 -; SI-NEXT: s_lshr_b32 s57, s45, 16 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_lshr_b32 s58, s24, 16 -; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_add_u32 s25, s25, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_lshr_b32 s60, s25, 16 -; SI-NEXT: s_lshr_b32 s61, s27, 16 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s28, s28, 0 -; SI-NEXT: s_lshr_b32 s62, s26, 16 -; SI-NEXT: s_lshr_b32 s63, s28, 16 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_lshr_b32 s72, s22, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_lshr_b32 s74, s20, 16 -; SI-NEXT: s_lshr_b32 s75, s21, 16 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_lshr_b32 s76, s18, 16 -; SI-NEXT: s_lshr_b32 s77, s19, 16 -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_lshr_b32 s78, s16, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 -; SI-NEXT: s_add_u32 s14, s14, 3 -; SI-NEXT: s_addc_u32 s15, s15, 0 -; SI-NEXT: s_lshr_b32 s88, s14, 16 -; SI-NEXT: s_lshr_b32 s89, s15, 16 -; SI-NEXT: s_add_u32 s12, s12, 3 -; SI-NEXT: s_addc_u32 s13, s13, 0 -; SI-NEXT: s_lshr_b32 s90, s12, 16 -; SI-NEXT: s_lshr_b32 s91, s13, 16 -; SI-NEXT: s_add_u32 s10, s10, 3 -; SI-NEXT: s_addc_u32 s11, s11, 0 -; SI-NEXT: s_lshr_b32 s92, s10, 16 -; SI-NEXT: s_lshr_b32 s93, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s94, s7, 16 -; SI-NEXT: s_lshr_b32 s95, s8, 16 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s45 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v56, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, vcc_hi -; SI-NEXT: v_cvt_f32_f16_e32 v29, vcc_lo -; SI-NEXT: v_cvt_f32_f16_e32 v26, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s94 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s90 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s89 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s88 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s79 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s78 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s77 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s75 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s74 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s73 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s72 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s63 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s62 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s60 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s56 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s29 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s21, 16 +; SI-NEXT: s_lshr_b32 s51, s23, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s41, 16 +; SI-NEXT: s_lshr_b32 s54, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s45, 16 +; SI-NEXT: s_lshr_b32 s64, s47, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB45_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_or_b32_e32 v9, v42, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v11, v40, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_or_b32_e32 v6, v45, v6 -; SI-NEXT: v_or_b32_e32 v8, v43, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v13, v54, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v54 -; SI-NEXT: v_or_b32_e32 v15, v52, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; SI-NEXT: v_or_b32_e32 v17, v50, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v38, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; SI-NEXT: v_or_b32_e32 v27, v32, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; SI-NEXT: v_or_b32_e32 v12, v55, v12 -; SI-NEXT: v_or_b32_e32 v14, v53, v14 -; SI-NEXT: v_or_b32_e32 v16, v51, v16 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v39, v20 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: s_lshl_b32 s27, s30, 16 +; SI-NEXT: s_and_b32 s29, s46, 0xffff +; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s29, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s64, 16 +; SI-NEXT: s_or_b32 s29, s29, s46 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s46 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s46, s55, 16 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_lshl_b32 s46, s92, 16 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s46 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s46, s54, 16 +; SI-NEXT: s_or_b32 s43, s43, s46 +; SI-NEXT: s_lshl_b32 s46, s90, 16 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s46 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s46, s53, 16 +; SI-NEXT: s_or_b32 s41, s41, s46 +; SI-NEXT: s_lshl_b32 s46, s88, 16 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s46 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s46, s52, 16 +; SI-NEXT: s_or_b32 s25, s25, s46 +; SI-NEXT: s_lshl_b32 s46, s78, 16 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s46 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s46, s51, 16 +; SI-NEXT: s_or_b32 s23, s23, s46 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s46, s76, 16 +; SI-NEXT: s_or_b32 s20, s20, s46 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s46, s50, 16 +; SI-NEXT: s_or_b32 s21, s21, s46 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s46, s74, 16 +; SI-NEXT: s_or_b32 s18, s18, s46 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s49, 16 +; SI-NEXT: s_or_b32 s19, s19, s46 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s46, s72, 16 +; SI-NEXT: s_or_b32 s16, s16, s46 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_lshl_b32 s46, s48, 16 +; SI-NEXT: s_or_b32 s17, s17, s46 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s46, s62, 16 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s46, s39, 16 +; SI-NEXT: s_or_b32 s15, s15, s46 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s46, s60, 16 +; SI-NEXT: s_or_b32 s12, s12, s46 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s46, s38, 16 +; SI-NEXT: s_or_b32 s13, s13, s46 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s46, s58, 16 +; SI-NEXT: s_or_b32 s10, s10, s46 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s46, s37, 16 +; SI-NEXT: s_or_b32 s11, s11, s46 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s46, s56, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s8, s8, s46 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s46, s36, 16 +; SI-NEXT: s_or_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s28, s35, 16 +; SI-NEXT: s_or_b32 s4, s4, s26 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s26, s34, 16 +; SI-NEXT: s_or_b32 s9, s9, s46 +; SI-NEXT: s_or_b32 s7, s7, s28 +; SI-NEXT: s_or_b32 s5, s5, s26 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_mov_b32_e32 v1, s29 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: @@ -33114,236 +30733,284 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -33395,157 +31062,20 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -33558,207 +31088,182 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -33766,74 +31271,95 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -34680,84 +32206,37 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v15i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -34774,576 +32253,373 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -35369,27 +32645,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v60f16_to_v15i64_scalar: @@ -40584,273 +37840,96 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v30, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v59 -; SI-NEXT: v_mov_b32_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v63 -; SI-NEXT: v_mov_b32_e32 v63, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 -; SI-NEXT: v_mov_b32_e32 v56, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v57 -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_mov_b32_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v32 -; SI-NEXT: v_mov_b32_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB52_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 -; SI-NEXT: v_add_f64 v[30:31], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -40858,314 +37937,154 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v30, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v31, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v33, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v34, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v35, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v36, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v39, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v51, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v40, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v43, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v1 ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v40 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v48 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v55 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v50 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v34 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v37 +; SI-NEXT: v_or_b32_e32 v17, v17, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v15f64_to_v60f16: @@ -41754,14 +38673,14 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v24, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 ; SI-NEXT: v_mov_b32_e32 v28, s18 ; SI-NEXT: v_mov_b32_e32 v29, s19 ; SI-NEXT: v_mov_b32_e32 v26, s20 ; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_mov_b32_e32 v21, s25 ; SI-NEXT: v_mov_b32_e32 v18, s26 @@ -41787,463 +38706,191 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v0 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[40:41], v[12:13], 16 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 -; SI-NEXT: v_add_f64 v[39:40], v[22:23], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshr_b64 v[41:42], v[10:11], 16 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 -; SI-NEXT: v_mov_b32_e32 v56, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshr_b64 v[42:43], v[8:9], 16 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshr_b64 v[43:44], v[6:7], 16 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: v_add_f64 v[41:42], v[26:27], 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[44:45], v[4:5], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_mov_b32_e32 v31, v15 -; SI-NEXT: v_mov_b32_e32 v59, v13 -; SI-NEXT: v_mov_b32_e32 v58, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[30:31], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[2:3], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v60 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v10, v61 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v18, v62 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v20, v56 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v22, v35 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v32, v24, v32 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_or_b32_e32 v33, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v34, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v36 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v36, v25, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v37, v24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v39, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v48, v20, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v49, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v50, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v18, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; SI-NEXT: v_or_b32_e32 v52, v16, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v53, v16, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; SI-NEXT: v_or_b32_e32 v30, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v2, v34 +; SI-NEXT: v_mov_b32_e32 v3, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v5, v37 +; SI-NEXT: v_mov_b32_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v11, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -42260,100 +38907,59 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_mov_b32_e32 v15, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: v_mov_b32_e32 v1, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: @@ -43243,316 +39849,156 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_mov_b32_e32 v51, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_mov_b32_e32 v40, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v47, v6 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v57, v4 +; SI-NEXT: v_mov_b32_e32 v58, v3 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v60 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v61 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_or_b32_e32 v2, v46, v2 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v54, v22 -; SI-NEXT: v_or_b32_e32 v23, v52, v23 -; SI-NEXT: v_or_b32_e32 v24, v50, v24 -; SI-NEXT: v_or_b32_e32 v25, v48, v25 -; SI-NEXT: v_or_b32_e32 v26, v38, v26 -; SI-NEXT: v_or_b32_e32 v27, v36, v27 -; SI-NEXT: v_or_b32_e32 v28, v34, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -43560,121 +40006,192 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v44, v19 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -43687,207 +40204,182 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v63 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -43895,74 +40387,95 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v54 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 -; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 @@ -44809,84 +41322,37 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-LABEL: bitcast_v60f16_to_v15f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: s_lshr_b32 s40, s17, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s28, 16 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s11, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s20, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b32 s41, s18, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s16, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -44903,576 +41369,373 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: s_lshr_b32 s15, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, s15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 -; SI-NEXT: s_lshr_b32 s13, s20, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: s_lshr_b32 s11, s22, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: s_lshr_b32 s9, s24, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: s_lshr_b32 s7, s26, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_lshr_b32 s4, s29, 16 -; SI-NEXT: s_lshr_b32 s5, s28, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v4, v40, v4 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_or_b32_e32 v6, v52, v6 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_or_b32_e32 v9, v38, v9 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_or_b32_e32 v10, v36, v10 -; SI-NEXT: v_mov_b32_e32 v33, v63 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_or_b32_e32 v11, v34, v11 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s44, s42, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: s_and_b32 s44, s18, 0xffff +; SI-NEXT: s_lshl_b32 s45, s41, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_lshl_b32 s46, s40, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: s_or_b32 s45, s45, s46 +; SI-NEXT: s_and_b32 s46, s20, 0xffff +; SI-NEXT: s_lshl_b32 s47, s15, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_and_b32 s47, s21, 0xffff +; SI-NEXT: s_lshl_b32 s56, s14, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s56, s22, 0xffff +; SI-NEXT: s_lshl_b32 s57, s13, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_lshl_b32 s58, s12, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: s_or_b32 s57, s57, s58 +; SI-NEXT: s_and_b32 s58, s24, 0xffff +; SI-NEXT: s_lshl_b32 s59, s11, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v58, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_and_b32 s59, s25, 0xffff +; SI-NEXT: s_lshl_b32 s60, s10, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_or_b32 s59, s59, s60 +; SI-NEXT: s_and_b32 s60, s26, 0xffff +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_lshl_b32 s62, s8, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s62, s28, 0xffff +; SI-NEXT: s_lshl_b32 s63, s7, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: s_or_b32 s62, s62, s63 +; SI-NEXT: s_and_b32 s63, s29, 0xffff +; SI-NEXT: s_lshl_b32 s72, s6, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_or_b32 s63, s63, s72 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s44 +; SI-NEXT: v_mov_b32_e32 v3, s45 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v5, s47 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, s58 +; SI-NEXT: v_mov_b32_e32 v9, s59 +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v11, s61 +; SI-NEXT: v_mov_b32_e32 v12, s62 +; SI-NEXT: v_mov_b32_e32 v13, s63 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v61 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s23 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s26 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s29 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v48 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v43 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v40 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v59 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v39 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v42 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v33 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 @@ -45498,27 +41761,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v43, v42 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v61, v36 -; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v60f16_to_v15f64_scalar: @@ -46254,6 +42497,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v60i16_to_v60f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -46270,834 +42514,805 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v52 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v51 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v62 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v57 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v43 +; SI-NEXT: v_or_b32_e32 v45, v1, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v63, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v63, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v61, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v59, v1, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB56_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB56_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v57, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v57, v53, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v57 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v60 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v56, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v56, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v46, v33, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v44, v1, v9 +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v42, v1, v47 +; SI-NEXT: v_alignbit_b32 v1, v42, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v41, v1, v49 +; SI-NEXT: v_alignbit_b32 v1, v41, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v55, v1, v30 +; SI-NEXT: v_alignbit_b32 v1, v55, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v9, v1, v58 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_alignbit_b32 v1, v9, v37, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v1, v62, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v48, v1, v51 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_alignbit_b32 v1, v48, v38, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v26, v51, v26 +; SI-NEXT: v_or_b32_e32 v24, v62, v24 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v20, v60, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v20, v30, v20 +; SI-NEXT: v_or_b32_e32 v18, v36, v18 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v16, v35, v16 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_or_b32_e32 v28, v52, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v0 +; SI-NEXT: v_alignbit_b32 v0, v45, v2, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v63, v4, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v61, v6, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v59, v8, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v57, v10, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v56, v12, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v46, v14, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v44, v16, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v42, v18, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v13 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v41, v20, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v55, v22, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v54, v25, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v24, v26, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v48, v29, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v0, v28, v39, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v27, v25 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -47114,14 +43329,39 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v60f16: @@ -47730,656 +43970,716 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-LABEL: bitcast_v60i16_to_v60f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v30, s30, 0 +; SI-NEXT: v_writelane_b32 v30, s31, 1 +; SI-NEXT: v_writelane_b32 v30, s34, 2 +; SI-NEXT: v_writelane_b32 v30, s35, 3 +; SI-NEXT: v_writelane_b32 v30, s36, 4 +; SI-NEXT: v_writelane_b32 v30, s37, 5 +; SI-NEXT: v_writelane_b32 v30, s38, 6 +; SI-NEXT: v_writelane_b32 v30, s39, 7 +; SI-NEXT: v_writelane_b32 v30, s48, 8 +; SI-NEXT: v_writelane_b32 v30, s49, 9 +; SI-NEXT: v_writelane_b32 v30, s50, 10 +; SI-NEXT: v_writelane_b32 v30, s51, 11 +; SI-NEXT: v_writelane_b32 v30, s52, 12 +; SI-NEXT: v_writelane_b32 v30, s53, 13 +; SI-NEXT: v_writelane_b32 v30, s54, 14 +; SI-NEXT: v_writelane_b32 v30, s55, 15 +; SI-NEXT: v_writelane_b32 v30, s64, 16 +; SI-NEXT: v_writelane_b32 v30, s65, 17 +; SI-NEXT: v_writelane_b32 v30, s66, 18 +; SI-NEXT: v_writelane_b32 v30, s67, 19 +; SI-NEXT: v_writelane_b32 v30, s68, 20 +; SI-NEXT: v_writelane_b32 v30, s69, 21 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: ; implicit-def: $vgpr31 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v30, s70, 22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v31, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v30, s71, 23 +; SI-NEXT: v_writelane_b32 v31, s4, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_writelane_b32 v30, s80, 24 +; SI-NEXT: v_writelane_b32 v31, s4, 2 +; SI-NEXT: v_writelane_b32 v30, s81, 25 +; SI-NEXT: v_writelane_b32 v31, s29, 3 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v30, s82, 26 +; SI-NEXT: v_writelane_b32 v31, s4, 4 +; SI-NEXT: v_writelane_b32 v30, s83, 27 +; SI-NEXT: v_writelane_b32 v31, s27, 5 +; SI-NEXT: v_writelane_b32 v30, s84, 28 +; SI-NEXT: v_writelane_b32 v31, s25, 6 +; SI-NEXT: v_writelane_b32 v30, s85, 29 +; SI-NEXT: v_writelane_b32 v31, s23, 7 +; SI-NEXT: v_writelane_b32 v30, s86, 30 +; SI-NEXT: v_writelane_b32 v31, s21, 8 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_writelane_b32 v30, s87, 31 +; SI-NEXT: v_writelane_b32 v31, s4, 9 +; SI-NEXT: v_writelane_b32 v30, s96, 32 +; SI-NEXT: v_writelane_b32 v31, s16, 10 +; SI-NEXT: v_writelane_b32 v30, s97, 33 +; SI-NEXT: s_mov_b32 s59, s20 +; SI-NEXT: v_writelane_b32 v31, s18, 11 +; SI-NEXT: v_writelane_b32 v30, s98, 34 +; SI-NEXT: s_mov_b32 s98, s22 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_readfirstlane_b32 s78, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_readfirstlane_b32 s85, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_readfirstlane_b32 s96, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_readfirstlane_b32 s77, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_readfirstlane_b32 s39, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_readfirstlane_b32 s97, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s51, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_readfirstlane_b32 s87, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_readfirstlane_b32 s75, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s49, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: s_lshr_b32 s80, s29, 16 +; SI-NEXT: s_lshr_b32 s69, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s26, 16 +; SI-NEXT: s_lshr_b32 s68, s25, 16 +; SI-NEXT: s_lshr_b32 s38, s24, 16 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s36, s22, 16 +; SI-NEXT: s_lshr_b32 s66, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s19, 16 +; SI-NEXT: s_lshr_b32 s84, s18, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s16, 16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: s_lshr_b32 s43, s29, 16 -; SI-NEXT: s_lshr_b32 s42, s28, 16 -; SI-NEXT: s_lshr_b32 s41, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s26, 16 -; SI-NEXT: s_lshr_b32 s15, s25, 16 -; SI-NEXT: s_lshr_b32 s14, s24, 16 -; SI-NEXT: s_lshr_b32 s13, s23, 16 -; SI-NEXT: s_lshr_b32 s12, s22, 16 -; SI-NEXT: s_lshr_b32 s11, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s20, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_writelane_b32 v31, s59, 12 +; SI-NEXT: v_writelane_b32 v30, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s29, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_readfirstlane_b32 s37, v1 +; SI-NEXT: v_readfirstlane_b32 s55, v0 +; SI-NEXT: v_readfirstlane_b32 s86, v17 +; SI-NEXT: v_readfirstlane_b32 s58, v18 +; SI-NEXT: v_readfirstlane_b32 s30, v19 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s83, v14 +; SI-NEXT: v_readfirstlane_b32 s31, v13 +; SI-NEXT: v_readfirstlane_b32 s82, v12 +; SI-NEXT: v_readfirstlane_b32 s91, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s79, v9 +; SI-NEXT: v_readfirstlane_b32 s99, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v7 +; SI-NEXT: v_readfirstlane_b32 s71, v6 +; SI-NEXT: v_readfirstlane_b32 s89, v5 +; SI-NEXT: v_readfirstlane_b32 s90, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v3 +; SI-NEXT: v_writelane_b32 v31, s98, 13 +; SI-NEXT: v_writelane_b32 v31, s58, 14 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s26 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s28 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s29 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v20 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 -; SI-NEXT: v_mov_b32_e32 v35, v21 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v21 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v37, v23 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 -; SI-NEXT: v_mov_b32_e32 v49, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_mov_b32_e32 v52, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 -; SI-NEXT: v_mov_b32_e32 v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s64, 16 +; SI-NEXT: s_or_b32 s47, s5, s7 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s7, s65, 16 +; SI-NEXT: s_or_b32 s45, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s66, 16 +; SI-NEXT: s_or_b32 s43, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s67, 16 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s68, 16 +; SI-NEXT: s_or_b32 s15, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s69, 16 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s80, 16 +; SI-NEXT: s_or_b32 s11, s5, s7 +; SI-NEXT: s_and_b32 s5, s37, 0xffff +; SI-NEXT: s_lshl_b32 s7, s90, 16 +; SI-NEXT: s_or_b32 s9, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 2 +; SI-NEXT: v_readlane_b32 s4, v31, 9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s7, s71, 16 +; SI-NEXT: s_lshl_b32 s42, s4, 16 +; SI-NEXT: v_readlane_b32 s4, v31, 4 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: v_readlane_b32 s5, v31, 0 +; SI-NEXT: v_writelane_b32 v31, s21, 21 +; SI-NEXT: v_writelane_b32 v31, s77, 22 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s56, s99, 16 +; SI-NEXT: v_writelane_b32 v31, s90, 23 +; SI-NEXT: s_or_b32 s5, s5, s56 +; SI-NEXT: s_and_b32 s56, s75, 0xffff +; SI-NEXT: s_lshl_b32 s57, s81, 16 +; SI-NEXT: v_writelane_b32 v31, s89, 24 +; SI-NEXT: s_or_b32 vcc_hi, s56, s57 +; SI-NEXT: s_and_b32 s56, s51, 0xffff +; SI-NEXT: s_lshl_b32 s57, s82, 16 +; SI-NEXT: v_writelane_b32 v31, s99, 25 +; SI-NEXT: v_writelane_b32 v31, s97, 26 +; SI-NEXT: s_mov_b32 s97, s49 +; SI-NEXT: s_or_b32 s49, s56, s57 +; SI-NEXT: s_and_b32 s56, s39, 0xffff +; SI-NEXT: s_lshl_b32 s57, s83, 16 +; SI-NEXT: v_writelane_b32 v31, s51, 27 +; SI-NEXT: s_or_b32 s51, s56, s57 +; SI-NEXT: s_and_b32 s56, s96, 0xffff +; SI-NEXT: s_lshl_b32 s57, s30, 16 +; SI-NEXT: s_or_b32 s53, s56, s57 +; SI-NEXT: s_and_b32 s56, s78, 0xffff +; SI-NEXT: s_lshl_b32 s57, s86, 16 +; SI-NEXT: s_lshl_b32 s46, s94, 16 +; SI-NEXT: s_mov_b32 s27, s67 +; SI-NEXT: s_mov_b32 s67, s66 +; SI-NEXT: s_mov_b32 s66, s65 +; SI-NEXT: s_mov_b32 s65, s64 +; SI-NEXT: s_mov_b32 s64, s55 +; SI-NEXT: s_or_b32 s55, s56, s57 +; SI-NEXT: s_and_b32 s56, s16, 0xffff +; SI-NEXT: s_lshl_b32 s44, s84, 16 +; SI-NEXT: s_mov_b32 s70, s69 +; SI-NEXT: s_mov_b32 s74, s68 +; SI-NEXT: s_or_b32 s60, s56, s46 +; SI-NEXT: s_lshr_b64 s[68:69], s[46:47], 16 +; SI-NEXT: s_and_b32 s46, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s21, 16 +; SI-NEXT: s_or_b32 s72, s46, s44 +; SI-NEXT: s_lshr_b64 s[20:21], s[44:45], 16 +; SI-NEXT: s_and_b32 s44, s59, 0xffff +; SI-NEXT: s_lshl_b32 s40, s36, 16 +; SI-NEXT: s_lshl_b32 s10, s4, 16 +; SI-NEXT: s_lshl_b32 s4, s23, 16 +; SI-NEXT: s_mov_b32 s90, s23 +; SI-NEXT: s_or_b32 s62, s44, s42 +; SI-NEXT: s_lshr_b64 s[22:23], s[42:43], 16 +; SI-NEXT: s_and_b32 s42, s98, 0xffff +; SI-NEXT: s_lshl_b32 s14, s38, 16 +; SI-NEXT: s_lshl_b32 s52, s35, 16 +; SI-NEXT: s_mov_b32 s25, s17 +; SI-NEXT: s_mov_b32 s17, s35 +; SI-NEXT: s_lshl_b32 s54, s58, 16 +; SI-NEXT: s_or_b32 s58, s42, s40 +; SI-NEXT: s_lshr_b64 s[34:35], s[40:41], 16 +; SI-NEXT: s_and_b32 s40, s24, 0xffff +; SI-NEXT: s_lshl_b32 s12, s95, 16 +; SI-NEXT: s_or_b32 s56, s40, s14 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 +; SI-NEXT: s_and_b32 s14, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s89, 16 +; SI-NEXT: s_or_b32 s46, s14, s12 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff +; SI-NEXT: s_or_b32 s44, s12, s10 +; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 16 +; SI-NEXT: s_and_b32 s10, s64, 0xffff +; SI-NEXT: s_or_b32 s42, s10, s8 +; SI-NEXT: s_lshr_b64 s[98:99], s[8:9], 16 +; SI-NEXT: s_and_b32 s8, s29, 0xffff +; SI-NEXT: s_mov_b32 s59, s41 +; SI-NEXT: s_or_b32 s40, s8, s6 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v31, s6, 15 +; SI-NEXT: v_writelane_b32 v31, s7, 16 +; SI-NEXT: v_readlane_b32 s6, v31, 1 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_or_b32 s14, s6, s4 +; SI-NEXT: s_mov_b32 s15, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_lshl_b32 vcc_lo, s79, 16 +; SI-NEXT: v_writelane_b32 v31, s4, 17 +; SI-NEXT: v_writelane_b32 v31, s5, 18 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], vcc, 16 +; SI-NEXT: s_lshl_b32 s48, s91, 16 +; SI-NEXT: s_or_b32 s12, s4, vcc_lo +; SI-NEXT: v_writelane_b32 v31, s6, 19 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_mov_b32 s73, s45 +; SI-NEXT: s_mov_b32 s45, s11 +; SI-NEXT: v_writelane_b32 v31, s7, 20 +; SI-NEXT: s_or_b32 s10, s4, s48 +; SI-NEXT: s_mov_b32 s11, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 +; SI-NEXT: s_mov_b32 s49, s97 +; SI-NEXT: v_readlane_b32 s97, v31, 26 +; SI-NEXT: s_lshl_b32 s50, s31, 16 +; SI-NEXT: v_readlane_b32 s77, v31, 22 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_or_b32 s8, s4, s50 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_or_b32 s6, s4, s52 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_mov_b32 s63, s43 +; SI-NEXT: s_mov_b32 s43, s9 +; SI-NEXT: s_mov_b32 s9, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_or_b32 s4, s4, s54 +; SI-NEXT: s_mov_b32 s5, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_mov_b32 s61, s47 +; SI-NEXT: s_mov_b32 s47, s13 +; SI-NEXT: s_mov_b32 s16, s29 +; SI-NEXT: s_mov_b32 s13, vcc_hi +; SI-NEXT: s_mov_b32 s23, s90 +; SI-NEXT: v_readlane_b32 s99, v31, 25 +; SI-NEXT: v_readlane_b32 s89, v31, 24 +; SI-NEXT: v_readlane_b32 s90, v31, 23 +; SI-NEXT: v_readlane_b32 s21, v31, 21 +; SI-NEXT: v_readlane_b32 s51, v31, 27 +; SI-NEXT: s_mov_b32 s7, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_mov_b32 s18, s68 +; SI-NEXT: s_mov_b32 s55, s64 +; SI-NEXT: s_mov_b32 s64, s65 +; SI-NEXT: s_mov_b32 s65, s66 +; SI-NEXT: s_mov_b32 s66, s67 +; SI-NEXT: s_mov_b32 s67, s27 +; SI-NEXT: s_mov_b32 s68, s74 +; SI-NEXT: s_mov_b32 s69, s70 +; SI-NEXT: s_mov_b32 s35, s17 +; SI-NEXT: s_mov_b32 s17, s25 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s6 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s21 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s17 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s18 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s23 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s19 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: s_add_i32 s4, s85, 3 +; SI-NEXT: v_readlane_b32 s5, v31, 14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s78, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s86, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s77, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s7, s96, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s30, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s8, s97, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s9, s31, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s9, s39, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s83, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s10, s87, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s11, s91, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s11, s51, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s82, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s12, s49, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_add_i32 s13, s75, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s81, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v31, 1 ; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s40 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s15, s23, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v31, 0 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s27 -; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v16, s41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v21 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s40, s99, 16 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: s_add_i32 s40, s16, 3 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s41, s89, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 2 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_add_i32 s41, s16, 3 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s42, s71, 16 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: s_add_i32 s42, s55, 3 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s43, s21, 16 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_add_i32 s43, s37, 3 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s90, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_readlane_b32 s16, v31, 4 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_lshl_b32 s44, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 3 +; SI-NEXT: s_or_b32 s28, s44, s28 +; SI-NEXT: s_add_i32 s29, s16, 3 +; SI-NEXT: s_add_i32 s44, s28, 0x30000 +; SI-NEXT: s_and_b32 s28, s29, 0xffff +; SI-NEXT: s_lshl_b32 s29, s80, 16 +; SI-NEXT: s_or_b32 s28, s29, s28 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s45, s28, 0x30000 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s28, s95, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 5 +; SI-NEXT: s_or_b32 s26, s28, s26 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: s_add_i32 s46, s26, 0x30000 +; SI-NEXT: s_and_b32 s26, s27, 0xffff +; SI-NEXT: s_lshl_b32 s27, s69, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s47, s26, 0x30000 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s26, s38, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 6 +; SI-NEXT: s_or_b32 s24, s26, s24 +; SI-NEXT: s_add_i32 s25, s16, 3 +; SI-NEXT: s_add_i32 s56, s24, 0x30000 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s68, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 13 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_add_i32 s22, s16, 3 +; SI-NEXT: s_add_i32 s57, s24, 0x30000 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s36, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 7 +; SI-NEXT: s_or_b32 s22, s24, s22 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: s_add_i32 s58, s22, 0x30000 +; SI-NEXT: s_and_b32 s22, s23, 0xffff +; SI-NEXT: s_lshl_b32 s23, s67, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 12 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v31, 9 +; SI-NEXT: s_add_i32 s59, s22, 0x30000 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 8 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s21, s16, 3 +; SI-NEXT: s_add_i32 s62, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_lshl_b32 s21, s66, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 11 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s16, 3 +; SI-NEXT: s_add_i32 s63, s20, 0x30000 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s20, s84, 16 +; SI-NEXT: s_or_b32 s18, s20, s18 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s72, s18, 0x30000 +; SI-NEXT: s_and_b32 s18, s19, 0xffff +; SI-NEXT: s_lshl_b32 s19, s65, 16 +; SI-NEXT: v_readlane_b32 s16, v31, 10 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s73, s18, 0x30000 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s94, 16 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: s_add_i32 s40, s40, 0x30000 +; SI-NEXT: s_add_i32 s41, s41, 0x30000 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v31, s16, 15 +; SI-NEXT: v_writelane_b32 v31, s17, 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s42, s42, 0x30000 +; SI-NEXT: s_add_i32 s43, s43, 0x30000 +; SI-NEXT: v_writelane_b32 v31, s16, 17 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_lshr_b64 s[98:99], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v31, s17, 18 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v31, s16, 19 +; SI-NEXT: s_lshr_b64 s[48:49], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s64, s61, 16 +; SI-NEXT: s_lshr_b32 s65, s73, 16 +; SI-NEXT: s_lshr_b32 s66, s63, 16 +; SI-NEXT: s_lshr_b32 s67, s59, 16 +; SI-NEXT: s_lshr_b32 s68, s57, 16 +; SI-NEXT: s_lshr_b32 s69, s47, 16 +; SI-NEXT: s_lshr_b32 s80, s45, 16 +; SI-NEXT: s_lshr_b32 s90, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s41, 16 +; SI-NEXT: s_lshr_b32 s99, s15, 16 +; SI-NEXT: s_lshr_b32 s81, s13, 16 +; SI-NEXT: s_lshr_b32 s82, s11, 16 +; SI-NEXT: s_lshr_b32 s83, s9, 16 +; SI-NEXT: s_lshr_b32 s30, s7, 16 +; SI-NEXT: s_lshr_b32 s86, s5, 16 +; SI-NEXT: v_writelane_b32 v31, s17, 20 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v44 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v59 -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v55 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 -; SI-NEXT: v_or_b32_e32 v19, v33, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_or_b32_e32 v20, v32, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 -; SI-NEXT: v_or_b32_e32 v21, v34, v21 -; SI-NEXT: v_or_b32_e32 v22, v32, v22 -; SI-NEXT: v_or_b32_e32 v23, v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v60 -; SI-NEXT: v_or_b32_e32 v24, v32, v24 -; SI-NEXT: v_or_b32_e32 v25, v33, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v61 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v32, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_or_b32_e32 v27, v33, v27 -; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xffff +; SI-NEXT: s_lshl_b32 s18, s64, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s72, 0xffff +; SI-NEXT: s_lshl_b32 s19, s20, 16 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s73, 0xffff +; SI-NEXT: s_lshl_b32 s20, s65, 16 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s20, s62, 0xffff +; SI-NEXT: s_lshl_b32 s21, s22, 16 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_and_b32 s21, s63, 0xffff +; SI-NEXT: s_lshl_b32 s22, s66, 16 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s58, 0xffff +; SI-NEXT: s_lshl_b32 s23, s34, 16 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s23, s59, 0xffff +; SI-NEXT: s_lshl_b32 s24, s67, 16 +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_and_b32 s24, s56, 0xffff +; SI-NEXT: s_lshl_b32 s25, s76, 16 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s25, s57, 0xffff +; SI-NEXT: s_lshl_b32 s26, s68, 16 +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: s_and_b32 s26, s46, 0xffff +; SI-NEXT: s_lshl_b32 s27, s88, 16 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s28, s69, 16 +; SI-NEXT: s_or_b32 s27, s27, s28 +; SI-NEXT: s_and_b32 s28, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: s_and_b32 s29, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s80, 16 +; SI-NEXT: s_or_b32 s29, s29, s44 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_lshl_b32 s44, s98, 16 +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 s44, s90, 16 +; SI-NEXT: s_or_b32 s43, s43, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 15 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: v_readlane_b32 s45, v31, 16 +; SI-NEXT: s_or_b32 s40, s40, s44 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s44, s71, 16 +; SI-NEXT: s_or_b32 s41, s41, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 17 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: v_readlane_b32 s45, v31, 18 +; SI-NEXT: s_or_b32 s14, s14, s44 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s44, s99, 16 +; SI-NEXT: s_or_b32 s15, s15, s44 +; SI-NEXT: v_readlane_b32 s44, v31, 19 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_or_b32 s12, s12, s44 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s44, s81, 16 +; SI-NEXT: s_or_b32 s13, s13, s44 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s44, s48, 16 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s44, s82, 16 +; SI-NEXT: s_or_b32 s11, s11, s44 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s44, s50, 16 +; SI-NEXT: s_or_b32 s8, s8, s44 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s44, s83, 16 +; SI-NEXT: s_or_b32 s9, s9, s44 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s44, s52, 16 +; SI-NEXT: s_or_b32 s6, s6, s44 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s44, s30, 16 +; SI-NEXT: s_or_b32 s7, s7, s44 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s44, s54, 16 +; SI-NEXT: s_or_b32 s4, s4, s44 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s44, s86, 16 +; SI-NEXT: s_or_b32 s5, s5, s44 +; SI-NEXT: v_readlane_b32 s45, v31, 20 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s42 +; SI-NEXT: v_mov_b32_e32 v15, s43 +; SI-NEXT: v_mov_b32_e32 v16, s40 +; SI-NEXT: v_mov_b32_e32 v17, s41 +; SI-NEXT: v_mov_b32_e32 v18, s14 +; SI-NEXT: v_mov_b32_e32 v19, s15 +; SI-NEXT: v_mov_b32_e32 v20, s12 +; SI-NEXT: v_mov_b32_e32 v21, s13 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v25, s9 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s7 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s5 +; SI-NEXT: v_readlane_b32 s99, v30, 35 +; SI-NEXT: v_readlane_b32 s98, v30, 34 +; SI-NEXT: v_readlane_b32 s97, v30, 33 +; SI-NEXT: v_readlane_b32 s96, v30, 32 +; SI-NEXT: v_readlane_b32 s87, v30, 31 +; SI-NEXT: v_readlane_b32 s86, v30, 30 +; SI-NEXT: v_readlane_b32 s85, v30, 29 +; SI-NEXT: v_readlane_b32 s84, v30, 28 +; SI-NEXT: v_readlane_b32 s83, v30, 27 +; SI-NEXT: v_readlane_b32 s82, v30, 26 +; SI-NEXT: v_readlane_b32 s81, v30, 25 +; SI-NEXT: v_readlane_b32 s80, v30, 24 +; SI-NEXT: v_readlane_b32 s71, v30, 23 +; SI-NEXT: v_readlane_b32 s70, v30, 22 +; SI-NEXT: v_readlane_b32 s69, v30, 21 +; SI-NEXT: v_readlane_b32 s68, v30, 20 +; SI-NEXT: v_readlane_b32 s67, v30, 19 +; SI-NEXT: v_readlane_b32 s66, v30, 18 +; SI-NEXT: v_readlane_b32 s65, v30, 17 +; SI-NEXT: v_readlane_b32 s64, v30, 16 +; SI-NEXT: v_readlane_b32 s55, v30, 15 +; SI-NEXT: v_readlane_b32 s54, v30, 14 +; SI-NEXT: v_readlane_b32 s53, v30, 13 +; SI-NEXT: v_readlane_b32 s52, v30, 12 +; SI-NEXT: v_readlane_b32 s51, v30, 11 +; SI-NEXT: v_readlane_b32 s50, v30, 10 +; SI-NEXT: v_readlane_b32 s49, v30, 9 +; SI-NEXT: v_readlane_b32 s48, v30, 8 +; SI-NEXT: v_readlane_b32 s39, v30, 7 +; SI-NEXT: v_readlane_b32 s38, v30, 6 +; SI-NEXT: v_readlane_b32 s37, v30, 5 +; SI-NEXT: v_readlane_b32 s36, v30, 4 +; SI-NEXT: v_readlane_b32 s35, v30, 3 +; SI-NEXT: v_readlane_b32 s34, v30, 2 +; SI-NEXT: v_readlane_b32 s31, v30, 1 +; SI-NEXT: v_readlane_b32 s30, v30, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: v_mov_b32_e32 v52, v33 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v49, v27 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: v_mov_b32_e32 v38, v24 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: v_mov_b32_e32 v37, v23 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v35, v21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: v_mov_b32_e32 v34, v20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: v_mov_b32_e32 v33, v19 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; kill: killed $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s16, s29 +; SI-NEXT: v_writelane_b32 v31, s4, 15 +; SI-NEXT: v_writelane_b32 v31, s5, 16 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v31, s4, 17 +; SI-NEXT: v_writelane_b32 v31, s5, 18 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v31, s4, 19 +; SI-NEXT: v_writelane_b32 v31, s5, 20 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v60i16_to_v60f16_scalar: @@ -49334,265 +45634,150 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v60f16_to_v60i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v50 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v51 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v60 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_or_b32_e32 v29, v29, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_or_b32_e32 v25, v25, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_or_b32_e32 v23, v23, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_or_b32_e32 v21, v21, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_or_b32_e32 v19, v19, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 @@ -49601,51 +45786,51 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_or_b32_e32 v17, v17, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_or_b32_e32 v13, v13, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 @@ -49654,68 +45839,68 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_or_b32_e32 v9, v9, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_or_b32_e32 v7, v7, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_or_b32_e32 v5, v5, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v42 ; SI-NEXT: v_or_b32_e32 v3, v3, v60 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 @@ -49724,148 +45909,147 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_or_b32_e32 v54, v54, v58 -; SI-NEXT: v_or_b32_e32 v53, v53, v57 -; SI-NEXT: v_or_b32_e32 v51, v51, v56 -; SI-NEXT: v_or_b32_e32 v50, v50, v47 -; SI-NEXT: v_or_b32_e32 v49, v49, v46 -; SI-NEXT: v_or_b32_e32 v39, v39, v45 -; SI-NEXT: v_or_b32_e32 v38, v38, v44 -; SI-NEXT: v_or_b32_e32 v37, v37, v43 -; SI-NEXT: v_or_b32_e32 v36, v36, v42 -; SI-NEXT: v_or_b32_e32 v35, v35, v41 -; SI-NEXT: v_or_b32_e32 v34, v34, v40 -; SI-NEXT: v_or_b32_e32 v32, v32, v55 -; SI-NEXT: v_or_b32_e32 v33, v33, v52 -; SI-NEXT: v_or_b32_e32 v31, v31, v48 -; SI-NEXT: v_alignbit_b32 v59, v1, v59, 16 -; SI-NEXT: v_alignbit_b32 v58, v3, v58, 16 -; SI-NEXT: v_alignbit_b32 v57, v5, v57, 16 -; SI-NEXT: v_alignbit_b32 v56, v7, v56, 16 -; SI-NEXT: v_alignbit_b32 v47, v9, v47, 16 -; SI-NEXT: v_alignbit_b32 v46, v11, v46, 16 -; SI-NEXT: v_alignbit_b32 v45, v13, v45, 16 -; SI-NEXT: v_alignbit_b32 v44, v15, v44, 16 -; SI-NEXT: v_alignbit_b32 v43, v17, v43, 16 -; SI-NEXT: v_alignbit_b32 v42, v19, v42, 16 -; SI-NEXT: v_alignbit_b32 v41, v21, v41, 16 -; SI-NEXT: v_alignbit_b32 v40, v23, v40, 16 -; SI-NEXT: v_alignbit_b32 v55, v25, v55, 16 -; SI-NEXT: v_alignbit_b32 v52, v27, v52, 16 -; SI-NEXT: v_alignbit_b32 v48, v29, v48, 16 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v44 +; SI-NEXT: v_or_b32_e32 v18, v18, v41 +; SI-NEXT: v_or_b32_e32 v20, v20, v54 +; SI-NEXT: v_or_b32_e32 v22, v22, v51 +; SI-NEXT: v_or_b32_e32 v24, v24, v48 +; SI-NEXT: v_or_b32_e32 v26, v26, v37 +; SI-NEXT: v_or_b32_e32 v28, v28, v35 +; SI-NEXT: v_alignbit_b32 v30, v1, v30, 16 +; SI-NEXT: v_alignbit_b32 v59, v3, v59, 16 +; SI-NEXT: v_alignbit_b32 v58, v5, v58, 16 +; SI-NEXT: v_alignbit_b32 v57, v7, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v47, v11, v47, 16 +; SI-NEXT: v_alignbit_b32 v46, v13, v46, 16 +; SI-NEXT: v_alignbit_b32 v45, v15, v45, 16 +; SI-NEXT: v_alignbit_b32 v44, v17, v44, 16 +; SI-NEXT: v_alignbit_b32 v41, v19, v41, 16 +; SI-NEXT: v_alignbit_b32 v54, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v51, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v48, 16 +; SI-NEXT: v_alignbit_b32 v37, v27, v37, 16 +; SI-NEXT: v_alignbit_b32 v35, v29, v35, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v30 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v30 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v30 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v30 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: v_or_b32_e32 v9, v9, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v47 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_or_b32_e32 v12, v12, v30 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v50 +; SI-NEXT: v_or_b32_e32 v13, v13, v30 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v45 +; SI-NEXT: v_or_b32_e32 v14, v14, v30 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v30 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_or_b32_e32 v16, v16, v30 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v30 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v30 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v30 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v57 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v56 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v47 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v45 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v43 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v30 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v51 +; SI-NEXT: v_or_b32_e32 v22, v22, v30 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v24, v24, v30 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_or_b32_e32 v25, v25, v30 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v37 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v24, v24, v32 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_or_b32_e32 v27, v27, v30 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v51 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 -; SI-NEXT: v_or_b32_e32 v10, v10, v49 -; SI-NEXT: v_or_b32_e32 v12, v12, v39 -; SI-NEXT: v_or_b32_e32 v14, v14, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v36 -; SI-NEXT: v_or_b32_e32 v20, v20, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_or_b32_e32 v28, v28, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -50477,77 +46661,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s10 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; SI-NEXT: s_lshr_b32 s40, s19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_lshr_b32 s12, s23, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_lshr_b32 s8, s27, 16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: s_lshr_b32 s13, s22, 16 -; SI-NEXT: s_lshr_b32 s15, s20, 16 -; SI-NEXT: s_lshr_b32 s41, s18, 16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -50564,594 +46677,555 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s41 -; SI-NEXT: s_lshr_b32 s11, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: s_lshr_b32 s42, s17, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; SI-NEXT: s_lshr_b32 s14, s21, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v15 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s26 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: s_lshr_b32 s7, s28, 16 -; SI-NEXT: s_lshr_b32 s9, s26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v37, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s26, 16 +; SI-NEXT: s_lshr_b32 s11, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s24, 16 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: s_lshr_b32 s13, s22, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s20, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: s_lshr_b32 s43, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s43 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB59_3 -; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_or_b32_e32 v56, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s21 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v30, v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v49, v4, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s25 +; SI-NEXT: v_or_b32_e32 v40, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_or_b32_e32 v53, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s27 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v35, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v46, v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_or_b32_e32 v38, v5, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v54 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 -; SI-NEXT: v_mov_b32_e32 v21, v40 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 -; SI-NEXT: v_mov_b32_e32 v23, v38 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_mov_b32_e32 v54, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_or_b32_e32 v5, v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_mov_b32_e32 v58, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 -; SI-NEXT: v_mov_b32_e32 v35, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v26 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, s22 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v10, v18, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v43, v1, v2 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v42, v19, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 -; SI-NEXT: v_or_b32_e32 v19, v21, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v27, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v19, v27, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s26 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v50, v23, v42 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, v16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s20 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_or_b32_e32 v25, v9, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_or_b32_e32 v59, v11, v16 +; SI-NEXT: v_or_b32_e32 v16, v12, v29 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_mov_b32_e32 v8, v55 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v7, v14, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v12, v8 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v14 +; SI-NEXT: v_mov_b32_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v1, v21, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 -; SI-NEXT: v_lshr_b64 v[50:51], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 -; SI-NEXT: v_or_b32_e32 v21, v23, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v37 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v61, v12, v13 +; SI-NEXT: v_or_b32_e32 v12, v20, v4 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 -; SI-NEXT: v_or_b32_e32 v23, v25, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v53, v37, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v56, v15, v14 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_or_b32_e32 v51, v22, v2 +; SI-NEXT: v_mov_b32_e32 v14, v8 +; SI-NEXT: v_or_b32_e32 v11, v27, v58 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v15 +; SI-NEXT: v_or_b32_e32 v13, v17, v45 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_lshr_b64 v[42:43], v[42:43], 16 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v31 -; SI-NEXT: v_or_b32_e32 v63, v29, v36 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v33, v61 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_mov_b32_e32 v61, v34 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshr_b64 v[28:29], v[10:11], 16 -; SI-NEXT: v_or_b32_e32 v49, v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_lshr_b64 v[58:59], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[59:60], v[14:15], 16 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v35, v38, v14 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 -; SI-NEXT: v_or_b32_e32 v24, v37, v4 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v36, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v52 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v55, v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v52, v36, v12 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v48, v37, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[4:5], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[6:7], 16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[8:9], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[18:19], 16 -; SI-NEXT: v_lshr_b64 v[44:45], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v25, v23 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_or_b32_e32 v30, v36, v41 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v38, v18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v36, v36, v22 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v38, v26 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v30 -; SI-NEXT: v_or_b32_e32 v30, v37, v20 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v18, v26, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v33 +; SI-NEXT: v_or_b32_e32 v20, v28, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v39, v62 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[20:21], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[4:5], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[41:42], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[12:13], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v41, v50 -; SI-NEXT: v_mov_b32_e32 v50, v59 -; SI-NEXT: v_mov_b32_e32 v40, v19 -; SI-NEXT: v_mov_b32_e32 v38, v21 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_or_b32_e32 v18, v54, v55 +; SI-NEXT: v_lshr_b64 v[26:27], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[24:25], 16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v53 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v52 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v37 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v63 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v18, v30 +; SI-NEXT: v_lshr_b64 v[32:33], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v23, v50 +; SI-NEXT: v_lshr_b64 v[29:30], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v54, v51 +; SI-NEXT: v_lshr_b64 v[50:51], v[55:56], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v16 +; SI-NEXT: v_mov_b32_e32 v16, v31 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_mov_b32_e32 v19, v38 +; SI-NEXT: v_lshr_b64 v[37:38], v[37:38], 16 +; SI-NEXT: v_mov_b32_e32 v27, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v39, v47 +; SI-NEXT: v_lshr_b64 v[47:48], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v25, v13 +; SI-NEXT: v_mov_b32_e32 v13, v44 +; SI-NEXT: v_lshr_b64 v[43:44], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: s_branch .LBB59_6 +; SI-NEXT: .LBB59_5: +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, s8 +; SI-NEXT: v_mov_b32_e32 v55, s10 +; SI-NEXT: v_mov_b32_e32 v16, s11 +; SI-NEXT: v_mov_b32_e32 v39, s12 +; SI-NEXT: v_mov_b32_e32 v30, s40 +; SI-NEXT: v_mov_b32_e32 v36, s15 +; SI-NEXT: v_mov_b32_e32 v31, s14 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v8, s19 +; SI-NEXT: v_mov_b32_e32 v49, s21 +; SI-NEXT: v_mov_b32_e32 v40, s23 +; SI-NEXT: v_mov_b32_e32 v53, s25 +; SI-NEXT: v_mov_b32_e32 v35, s27 +; SI-NEXT: v_mov_b32_e32 v46, s29 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v25, s28 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: v_mov_b32_e32 v34, v26 +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v5, v15 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v15, v37 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v26, s43 +; SI-NEXT: v_mov_b32_e32 v11, v28 +; SI-NEXT: v_mov_b32_e32 v28, s42 +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, s13 +; SI-NEXT: v_mov_b32_e32 v32, s9 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v47, s6 +; SI-NEXT: v_mov_b32_e32 v9, v24 +; SI-NEXT: .LBB59_6: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_or_b32_e32 v48, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_or_b32_e32 v38, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_or_b32_e32 v30, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v37, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; SI-NEXT: v_or_b32_e32 v39, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v61 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_or_b32_e32 v13, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v42 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v1, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_mov_b32_e32 v15, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_mov_b32_e32 v7, v30 +; SI-NEXT: v_mov_b32_e32 v9, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_mov_b32_e32 v5, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v24, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_mov_b32_e32 v11, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v56 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v28, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -51168,24 +47242,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v31 +; SI-NEXT: v_mov_b32_e32 v3, v48 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB59_4: -; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index ccc46cc5df39e..d44ffdfbc547c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2611,58 +2611,35 @@ define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6f16: @@ -2734,50 +2711,35 @@ define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 i ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_lshr_b32 s4, s18, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s17, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s18, 0xffff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v3i32_to_v6f16_scalar: @@ -2859,21 +2821,12 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2886,35 +2839,38 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -3003,61 +2959,56 @@ define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v3i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: s_cbranch_scc0 .LBB15_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_cbranch_execnz .LBB15_4 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB15_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: .LBB15_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 ; SI-NEXT: s_branch .LBB15_2 +; SI-NEXT: .LBB15_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3i32_scalar: ; VI: ; %bb.0: @@ -5905,58 +5856,35 @@ define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB28_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v4, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16: @@ -6025,53 +5953,43 @@ define inreg <6 x half> @bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v5, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[2:3], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_branch .LBB29_5 +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16_scalar: ; VI: ; %bb.0: @@ -6156,21 +6074,12 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6183,35 +6092,38 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB30_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: .LBB30_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -6300,61 +6212,56 @@ define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 ; SI-LABEL: bitcast_v6f16_to_v3f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_cbranch_scc0 .LBB31_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_cbranch_execnz .LBB31_4 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: .LBB31_3: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 ; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3f32_scalar: ; VI: ; %bb.0: @@ -8829,101 +8736,111 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v4, v12, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v11, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x300, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v1, v16, v1 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v7, v3, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6f16: @@ -9274,86 +9191,98 @@ define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 i ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s23, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_cbranch_execnz .LBB41_3 -; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s12, s6, s5 ; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s6, s25, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s10, s5, s8 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s23, 8 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s9, s7, s6 +; SI-NEXT: s_or_b32 s13, s5, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_mov_b32 s5, s13 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s16, 0xff -; SI-NEXT: s_lshl_b32 s10, s17, 8 -; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_addk_i32 s7, 0x300 -; SI-NEXT: s_addk_i32 s8, 0x300 -; SI-NEXT: s_addk_i32 s9, 0x300 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s10, s6, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s11, s10, 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s10, 0xffff +; SI-NEXT: s_lshl_b32 s7, s11, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v12i8_to_v6f16_scalar: @@ -9626,21 +9555,12 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v12i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -9662,12 +9582,15 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB42_3: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v4, v13, v1 -; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v8, v7, v8 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -9949,61 +9872,52 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s17, 16 +; SI-NEXT: s_lshr_b32 s20, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: s_cbranch_scc0 .LBB43_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_or_b32_e32 v12, v15, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v13, v14, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_or_b32_e32 v8, v0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 -; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s20, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s15, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_or_b32 s11, s9, s11 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s9, s11, 8 +; SI-NEXT: s_bfe_u32 s19, s15, 0x80008 +; SI-NEXT: s_bfe_u32 s21, s14, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB43_4 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v12, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v12, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v13, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v13, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 @@ -10013,22 +9927,36 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_branch .LBB43_5 +; SI-NEXT: .LBB43_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v13, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v12 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v13 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; VI: ; %bb.0: @@ -11651,84 +11579,72 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_alignbit_b32 v4, v1, v7, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v3, v2, v9, 16 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v4, v1, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v5, v4, v5, 16 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12027,73 +11943,60 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s9, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s9 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s7 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s6 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s5 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s7 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s9 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s5 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_lshr_b64 v[7:8], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshr_b64 v[7:8], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_lshr_b64 v[8:9], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v6bf16_to_v6f16_scalar: @@ -12431,85 +12334,73 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6bf16: @@ -12587,78 +12478,74 @@ define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i ; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: s_cbranch_scc0 .LBB51_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_lshl_b32 s10, s6, 16 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_lshl_b32 s12, s7, 16 +; SI-NEXT: s_lshl_b32 s13, s18, 16 +; SI-NEXT: s_lshl_b32 s14, s8, 16 +; SI-NEXT: s_cbranch_execnz .LBB51_4 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_branch .LBB51_5 +; SI-NEXT: .LBB51_3: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB51_2 +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: .LBB51_5: ; %end +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v6f16_to_v6bf16_scalar: ; VI: ; %bb.0: @@ -13769,62 +13656,50 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v6f16_to_v6i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_alignbit_b32 v6, v2, v3, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v3, 16 ; SI-NEXT: .LBB56_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13903,64 +13778,60 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v6i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s17, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s16, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: s_lshr_b32 s7, s18, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 -; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: s_cbranch_scc0 .LBB57_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: s_cbranch_execnz .LBB57_4 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v5, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: v_or_b32_e32 v3, v8, v7 -; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: s_branch .LBB57_5 +; SI-NEXT: .LBB57_3: +; SI-NEXT: s_branch .LBB57_2 +; SI-NEXT: .LBB57_4: +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v3, s18 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB57_4: -; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v6f16_to_v6i16_scalar: ; VI: ; %bb.0: @@ -14059,62 +13930,63 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v6i16_to_v6f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v1, v11 +; SI-NEXT: v_or_b32_e32 v3, v0, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_alignbit_b32 v7, v6, v10, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_alignbit_b32 v7, v6, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; SI-NEXT: .LBB58_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6f16: @@ -14191,53 +14063,61 @@ define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 i ; SI-LABEL: bitcast_v6i16_to_v6f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s18, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: s_or_b32 s12, s5, s6 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s14, s13, 16 +; SI-NEXT: s_or_b32 s15, s5, s6 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 16 +; SI-NEXT: s_mov_b32 s5, s15 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s12, s6, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s12, 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s11, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xffff +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v6i16_to_v6f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 12cb8d2f6fb51..f52a33c7c0f8d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -11918,20 +11918,18 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, half %val) { ; GFX7LESS-LABEL: uniform_fadd_f16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, s6 ; GFX7LESS-NEXT: s_and_b32 s4, s2, -4 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 ; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 ; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX7LESS-NEXT: s_not_b32 s2, s2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 @@ -13197,60 +13195,52 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, <2 x half> %val) { ; GFX7LESS-LABEL: uniform_fadd_v2f16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s4, s[4:5], 0xd ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_lshr_b32 s4, s6, 16 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, s6 ; GFX7LESS-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s4 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s5 ; GFX7LESS-NEXT: s_lshr_b32 s4, s5, 16 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7LESS-NEXT: v_add_f32_e32 v4, v4, v1 -; GFX7LESS-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v2, v3 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7LESS-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7LESS-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7LESS-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, v4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 2761cba5ea71b..02d7b50e23b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -20,20 +20,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: s_cbranch_vccnz .LBB0_2 -; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_branch .LBB0_3 -; SI-NEXT: .LBB0_2: ; %two -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: .LBB0_3: ; %one ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 +; SI-NEXT: s_cbranch_vccnz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %one ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm +; SI-NEXT: .LBB0_2: ; %two +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry @@ -145,20 +143,15 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0.5, v1 ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm ; SI-NEXT: .LBB1_2: ; %two -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -249,19 +242,16 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0.5, v1 ; SI-NEXT: s_cbranch_vccnz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %one -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB2_2: ; %two -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index d8ef44361c40d..27308e82a3354 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -3672,14 +3672,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3712,14 +3710,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4100,14 +4096,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4139,14 +4133,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4832,12 +4824,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4848,28 +4839,27 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4881,33 +4871,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4918,29 +4907,27 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4952,21 +4939,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -7039,50 +7026,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -7091,53 +7068,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7276,42 +7242,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7324,49 +7281,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7703,9 +7650,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7716,38 +7661,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7759,34 +7699,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7798,36 +7733,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7847,21 +7778,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB21_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -8034,50 +7963,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: @@ -8086,53 +8005,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -8301,42 +8209,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8349,49 +8248,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -8564,50 +8453,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: @@ -8616,53 +8495,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -8831,42 +8699,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8879,49 +8738,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index fc3ed6d332211..5b5fb8f3a1663 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -2773,14 +2773,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2813,14 +2811,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3222,14 +3218,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3261,14 +3255,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3976,12 +3968,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3992,28 +3983,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_max_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4025,33 +4015,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4062,29 +4051,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_max_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4096,21 +4083,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -6259,50 +6246,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6311,53 +6288,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6575,42 +6541,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6623,49 +6580,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7111,9 +7058,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7124,38 +7069,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7167,34 +7107,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -7206,36 +7141,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7255,21 +7186,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 8f270f9a466e2..c1c512b9c0a18 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -2773,14 +2773,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2813,14 +2811,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3222,14 +3218,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3261,14 +3255,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3976,12 +3968,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_not_b32_e32 v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3992,28 +3983,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_min_f32_e32 v4, v4, v11 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4025,33 +4015,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 -; GFX6-NEXT: v_not_b32_e32 v10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4062,29 +4051,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_min_f32_e32 v4, v4, v11 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4096,21 +4083,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -6259,50 +6246,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: @@ -6311,53 +6288,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6575,42 +6541,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6623,49 +6580,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_mov_b32_e32 v1, s20 ; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7111,9 +7058,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7124,38 +7069,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v9 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v9 -; GFX7-NEXT: v_min_f32_e32 v8, v8, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7167,34 +7107,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -7206,36 +7141,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v6, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v11 ; GFX6-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -7255,21 +7186,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 689f9d7d59550..95f7744d94882 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -786,20 +786,20 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; SI: ; %bb.0: ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: ; return to shader part epilog @@ -837,22 +837,22 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; SI: ; %bb.0: ; SI-NEXT: s_lshr_b32 s2, s0, 16 ; SI-NEXT: s_lshr_b32 s3, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s3 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v2 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index b5e0d3aeace32..638e4b01488a0 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -450,7 +450,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -533,7 +539,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -839,9 +851,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -918,9 +942,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -998,16 +1034,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1087,12 +1131,23 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e64 v2, -v2, -v2 clamp +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1176,9 +1231,20 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e64 v3, -v3, -v3 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1261,9 +1327,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1343,9 +1421,17 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1509,11 +1595,18 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; SI-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: v_cvt_f16_f32_e32 v3, 0 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v1, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -1607,9 +1700,17 @@ define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp -; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v1, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1933,7 +2034,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm_minimumnum_maximumnum(ptr ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -2016,7 +2123,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals_minimumnum_maximumnu ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -2173,9 +2286,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_minimumnum_maximumnum(pt ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e64 v3, v3, 1.0 clamp -; SI-NEXT: v_add_f32_e64 v2, v2, 1.0 clamp +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v2, 0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 711e2f2951fae..0d3567faaa10c 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -554,7 +554,11 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -662,7 +666,11 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -771,7 +779,11 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -2980,9 +2992,17 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3066,14 +3086,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3167,9 +3191,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3268,10 +3298,17 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 +; GFX6-NEXT: v_min_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3369,9 +3406,17 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3460,9 +3505,17 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3555,10 +3608,17 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3648,9 +3708,17 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3739,13 +3807,21 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3827,16 +3903,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 +; GFX6-NEXT: v_min_f32_e32 v2, 0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -3925,14 +4005,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v3, 0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -4352,7 +4436,11 @@ define half @v_clamp_f16_minimumnum_maximumnum(half %a) #1 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4408,7 +4496,11 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee(half %a) #5 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4466,7 +4558,13 @@ define half @v_clamp_f16_minimumnum_maximumnum_foldable_source(half %a, half %b) ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4525,7 +4623,13 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source(half %a, ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index c48efc925ea8b..11edef2929d7d 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1108,13 +1108,13 @@ define amdgpu_vs <3 x half> @load_v3i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s2, 16 ; GFX67-NEXT: s_lshr_b32 s5, s0, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s5, s5, s4 +; GFX67-NEXT: s_add_i32 s0, s0, s2 ; GFX67-NEXT: s_add_i32 s1, s1, s3 +; GFX67-NEXT: s_lshl_b32 s3, s5, 16 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff -; GFX67-NEXT: s_lshl_b32 s2, s5, 16 -; GFX67-NEXT: s_or_b32 s0, s0, s2 ; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_or_b32 s0, s0, s3 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog @@ -1170,19 +1170,19 @@ define amdgpu_vs <4 x half> @load_v4i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: s_lshr_b32 s6, s2, 16 ; GFX67-NEXT: s_lshr_b32 s5, s1, 16 +; GFX67-NEXT: s_lshr_b32 s6, s2, 16 ; GFX67-NEXT: s_lshr_b32 s7, s3, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s2 -; GFX67-NEXT: s_add_i32 s4, s4, s6 -; GFX67-NEXT: s_add_i32 s1, s1, s3 ; GFX67-NEXT: s_add_i32 s5, s5, s7 +; GFX67-NEXT: s_add_i32 s1, s1, s3 +; GFX67-NEXT: s_add_i32 s4, s4, s6 +; GFX67-NEXT: s_add_i32 s0, s0, s2 +; GFX67-NEXT: s_lshl_b32 s5, s5, 16 +; GFX67-NEXT: s_lshl_b32 s3, s4, 16 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff -; GFX67-NEXT: s_lshl_b32 s2, s4, 16 ; GFX67-NEXT: s_and_b32 s1, s1, 0xffff -; GFX67-NEXT: s_or_b32 s0, s0, s2 -; GFX67-NEXT: s_lshl_b32 s2, s5, 16 -; GFX67-NEXT: s_or_b32 s1, s1, s2 +; GFX67-NEXT: s_or_b32 s0, s0, s3 +; GFX67-NEXT: s_or_b32 s1, s1, s5 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog @@ -1246,23 +1246,23 @@ define amdgpu_vs <6 x half> @load_v6i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 ; GFX67-NEXT: s_lshr_b32 s9, s4, 16 ; GFX67-NEXT: s_lshr_b32 s7, s1, 16 -; GFX67-NEXT: s_lshr_b32 s10, s5, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: s_add_i32 s3, s3, s9 ; GFX67-NEXT: s_lshr_b32 s8, s2, 16 +; GFX67-NEXT: s_lshr_b32 s10, s5, 16 ; GFX67-NEXT: s_lshr_b32 s11, s6, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s5 +; GFX67-NEXT: s_add_i32 s3, s3, s9 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s8, s8, s11 +; GFX67-NEXT: s_add_i32 s2, s2, s6 ; GFX67-NEXT: s_add_i32 s7, s7, s10 +; GFX67-NEXT: s_add_i32 s1, s1, s5 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s3, s3, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s8, s8, s11 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshl_b32 s6, s7, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s3 -; GFX67-NEXT: s_lshl_b32 s3, s7, 16 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s3 ; GFX67-NEXT: s_lshl_b32 s3, s8, 16 +; GFX67-NEXT: s_or_b32 s1, s1, s6 ; GFX67-NEXT: s_or_b32 s2, s2, s3 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 @@ -1338,31 +1338,31 @@ define amdgpu_vs <8 x half> @load_v8i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshr_b32 s8, s0, 16 ; GFX67-NEXT: s_lshr_b32 s12, s4, 16 ; GFX67-NEXT: s_lshr_b32 s9, s1, 16 -; GFX67-NEXT: s_lshr_b32 s13, s5, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s4 -; GFX67-NEXT: s_add_i32 s8, s8, s12 ; GFX67-NEXT: s_lshr_b32 s10, s2, 16 +; GFX67-NEXT: s_lshr_b32 s11, s3, 16 +; GFX67-NEXT: s_lshr_b32 s13, s5, 16 ; GFX67-NEXT: s_lshr_b32 s14, s6, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s5 +; GFX67-NEXT: s_lshr_b32 s15, s7, 16 +; GFX67-NEXT: s_add_i32 s8, s8, s12 +; GFX67-NEXT: s_add_i32 s0, s0, s4 +; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_add_i32 s3, s3, s7 +; GFX67-NEXT: s_add_i32 s10, s10, s14 +; GFX67-NEXT: s_add_i32 s2, s2, s6 ; GFX67-NEXT: s_add_i32 s9, s9, s13 +; GFX67-NEXT: s_add_i32 s1, s1, s5 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s4, s8, 16 -; GFX67-NEXT: s_lshr_b32 s11, s3, 16 -; GFX67-NEXT: s_lshr_b32 s15, s7, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s6 -; GFX67-NEXT: s_add_i32 s10, s10, s14 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshl_b32 s11, s11, 16 +; GFX67-NEXT: s_lshl_b32 s6, s9, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s4 -; GFX67-NEXT: s_lshl_b32 s4, s9, 16 -; GFX67-NEXT: s_add_i32 s3, s3, s7 -; GFX67-NEXT: s_add_i32 s11, s11, s15 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s4 ; GFX67-NEXT: s_lshl_b32 s4, s10, 16 ; GFX67-NEXT: s_and_b32 s3, s3, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s6 ; GFX67-NEXT: s_or_b32 s2, s2, s4 -; GFX67-NEXT: s_lshl_b32 s4, s11, 16 -; GFX67-NEXT: s_or_b32 s3, s3, s4 +; GFX67-NEXT: s_or_b32 s3, s3, s11 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: v_mov_b32_e32 v2, s2 @@ -1447,60 +1447,60 @@ define amdgpu_vs <16 x half> @load_v16i16(ptr addrspace(6) inreg %p0, ptr addrsp ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s16, s0, 16 ; GFX67-NEXT: s_lshr_b32 s24, s8, 16 -; GFX67-NEXT: s_lshr_b32 s17, s1, 16 -; GFX67-NEXT: s_lshr_b32 s25, s9, 16 -; GFX67-NEXT: s_add_i32 s0, s0, s8 -; GFX67-NEXT: s_add_i32 s16, s16, s24 ; GFX67-NEXT: s_lshr_b32 s18, s2, 16 ; GFX67-NEXT: s_lshr_b32 s26, s10, 16 -; GFX67-NEXT: s_add_i32 s1, s1, s9 -; GFX67-NEXT: s_add_i32 s17, s17, s25 +; GFX67-NEXT: s_add_i32 s16, s16, s24 +; GFX67-NEXT: s_add_i32 s0, s0, s8 +; GFX67-NEXT: s_lshr_b32 s20, s4, 16 +; GFX67-NEXT: s_lshr_b32 s28, s12, 16 +; GFX67-NEXT: s_add_i32 s18, s18, s26 +; GFX67-NEXT: s_add_i32 s2, s2, s10 ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s8, s16, 16 +; GFX67-NEXT: s_lshr_b32 s17, s1, 16 ; GFX67-NEXT: s_lshr_b32 s19, s3, 16 +; GFX67-NEXT: s_lshr_b32 s21, s5, 16 +; GFX67-NEXT: s_lshr_b32 s22, s6, 16 +; GFX67-NEXT: s_lshr_b32 s23, s7, 16 +; GFX67-NEXT: s_lshr_b32 s25, s9, 16 ; GFX67-NEXT: s_lshr_b32 s27, s11, 16 -; GFX67-NEXT: s_add_i32 s2, s2, s10 -; GFX67-NEXT: s_add_i32 s18, s18, s26 -; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_lshr_b32 s29, s13, 16 +; GFX67-NEXT: s_lshr_b32 s30, s14, 16 +; GFX67-NEXT: s_lshr_b32 s31, s15, 16 +; GFX67-NEXT: s_add_i32 s20, s20, s28 +; GFX67-NEXT: s_add_i32 s4, s4, s12 ; GFX67-NEXT: s_or_b32 s0, s0, s8 -; GFX67-NEXT: s_lshl_b32 s8, s17, 16 -; GFX67-NEXT: s_lshr_b32 s20, s4, 16 -; GFX67-NEXT: s_lshr_b32 s28, s12, 16 -; GFX67-NEXT: s_add_i32 s3, s3, s11 -; GFX67-NEXT: s_add_i32 s19, s19, s27 ; GFX67-NEXT: s_and_b32 s2, s2, 0xffff -; GFX67-NEXT: s_or_b32 s1, s1, s8 ; GFX67-NEXT: s_lshl_b32 s8, s18, 16 -; GFX67-NEXT: s_lshr_b32 s21, s5, 16 -; GFX67-NEXT: s_lshr_b32 s29, s13, 16 -; GFX67-NEXT: s_add_i32 s4, s4, s12 -; GFX67-NEXT: s_add_i32 s20, s20, s28 -; GFX67-NEXT: s_and_b32 s3, s3, 0xffff -; GFX67-NEXT: s_or_b32 s2, s2, s8 -; GFX67-NEXT: s_lshl_b32 s8, s19, 16 -; GFX67-NEXT: s_lshr_b32 s22, s6, 16 -; GFX67-NEXT: s_lshr_b32 s30, s14, 16 -; GFX67-NEXT: s_add_i32 s5, s5, s13 +; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_add_i32 s7, s7, s15 +; GFX67-NEXT: s_add_i32 s22, s22, s30 +; GFX67-NEXT: s_add_i32 s6, s6, s14 ; GFX67-NEXT: s_add_i32 s21, s21, s29 +; GFX67-NEXT: s_add_i32 s5, s5, s13 +; GFX67-NEXT: s_add_i32 s19, s19, s27 +; GFX67-NEXT: s_add_i32 s3, s3, s11 +; GFX67-NEXT: s_add_i32 s17, s17, s25 +; GFX67-NEXT: s_add_i32 s1, s1, s9 +; GFX67-NEXT: s_or_b32 s2, s2, s8 ; GFX67-NEXT: s_and_b32 s4, s4, 0xffff -; GFX67-NEXT: s_or_b32 s3, s3, s8 ; GFX67-NEXT: s_lshl_b32 s8, s20, 16 -; GFX67-NEXT: s_lshr_b32 s23, s7, 16 -; GFX67-NEXT: s_lshr_b32 s31, s15, 16 -; GFX67-NEXT: s_add_i32 s6, s6, s14 -; GFX67-NEXT: s_add_i32 s22, s22, s30 -; GFX67-NEXT: s_and_b32 s5, s5, 0xffff +; GFX67-NEXT: s_lshl_b32 s23, s23, 16 +; GFX67-NEXT: s_lshl_b32 s14, s21, 16 +; GFX67-NEXT: s_lshl_b32 s12, s19, 16 +; GFX67-NEXT: s_lshl_b32 s10, s17, 16 +; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: s_and_b32 s3, s3, 0xffff ; GFX67-NEXT: s_or_b32 s4, s4, s8 -; GFX67-NEXT: s_lshl_b32 s8, s21, 16 -; GFX67-NEXT: s_add_i32 s7, s7, s15 -; GFX67-NEXT: s_add_i32 s23, s23, s31 +; GFX67-NEXT: s_and_b32 s5, s5, 0xffff ; GFX67-NEXT: s_and_b32 s6, s6, 0xffff -; GFX67-NEXT: s_or_b32 s5, s5, s8 ; GFX67-NEXT: s_lshl_b32 s8, s22, 16 ; GFX67-NEXT: s_and_b32 s7, s7, 0xffff +; GFX67-NEXT: s_or_b32 s1, s1, s10 +; GFX67-NEXT: s_or_b32 s3, s3, s12 +; GFX67-NEXT: s_or_b32 s5, s5, s14 ; GFX67-NEXT: s_or_b32 s6, s6, s8 -; GFX67-NEXT: s_lshl_b32 s8, s23, 16 -; GFX67-NEXT: s_or_b32 s7, s7, s8 +; GFX67-NEXT: s_or_b32 s7, s7, s23 ; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: v_mov_b32_e32 v2, s2 @@ -1820,11 +1820,11 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s1, s2, 16 ; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX67-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -1871,28 +1871,28 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <3 x half> @load_v3f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v3f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s3, 0 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 ; GFX67-NEXT: s_mov_b32 s1, s3 ; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s0 +; GFX67-NEXT: s_lshr_b32 s2, s4, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s0, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1 +; GFX67-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_add_f32_e32 v1, v4, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-NEXT: v_add_f32_e32 v1, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3f16: @@ -1941,33 +1941,33 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s3, 0 ; GFX67-NEXT: s_mov_b32 s2, s1 ; GFX67-NEXT: s_mov_b32 s1, s3 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX67-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s4, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: s_lshr_b32 s1, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s3 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX67-NEXT: s_lshr_b32 s4, s3, 16 +; GFX67-NEXT: s_lshr_b32 s5, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GFX67-NEXT: s_lshr_b32 s4, s2, 16 +; GFX67-NEXT: s_lshr_b32 s5, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX67-NEXT: v_add_f32_e32 v2, v4, v3 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v3, v6, v5 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4f16: @@ -2019,48 +2019,48 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <6 x half> @load_v6f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v6f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s5, 0 -; GFX67-NEXT: s_mov_b32 s4, s1 -; GFX67-NEXT: s_mov_b32 s1, s5 +; GFX67-NEXT: s_mov_b32 s3, 0 +; GFX67-NEXT: s_mov_b32 s2, s1 +; GFX67-NEXT: s_mov_b32 s1, s3 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s3, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s4 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s2, 16 +; GFX67-NEXT: s_lshr_b32 s3, s6, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX67-NEXT: s_lshr_b32 s2, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX67-NEXT: s_lshr_b32 s2, s1, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX67-NEXT: s_lshr_b32 s1, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 +; GFX67-NEXT: s_lshr_b32 s1, s4, 16 +; GFX67-NEXT: s_lshr_b32 s0, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v6, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s6 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX67-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX67-NEXT: v_add_f32_e32 v4, v8, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX67-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6f16: @@ -2126,57 +2126,56 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_mov_b32 s4, s1 ; GFX67-NEXT: s_mov_b32 s1, s5 ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s6, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s6 ; GFX67-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 +; GFX67-NEXT: s_lshr_b32 s11, s3, 16 +; GFX67-NEXT: s_lshr_b32 s12, s7, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s7 +; GFX67-NEXT: s_lshr_b32 s10, s2, 16 +; GFX67-NEXT: s_lshr_b32 s13, s6, 16 +; GFX67-NEXT: s_lshr_b32 s9, s1, 16 +; GFX67-NEXT: s_lshr_b32 s12, s5, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s13 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s10 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s9 +; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: s_lshr_b32 s11, s4, 16 +; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s5 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v8 -; GFX67-NEXT: s_lshr_b32 s1, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s0 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v9 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s8 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v8, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s3 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s7 +; GFX67-NEXT: v_add_f32_e32 v4, v10, v9 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v15 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v13 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v14 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v12 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX67-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX67-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8f16: @@ -2248,109 +2247,108 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac define amdgpu_vs <16 x half> @load_v16f16(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { ; GFX67-LABEL: load_v16f16: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_mov_b32 s3, 0 -; GFX67-NEXT: s_mov_b32 s2, s1 -; GFX67-NEXT: s_mov_b32 s1, s3 -; GFX67-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: s_lshr_b32 s0, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 -; GFX67-NEXT: s_lshr_b32 s0, s8, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: s_lshr_b32 s0, s9, 16 -; GFX67-NEXT: s_lshr_b32 s1, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: s_lshr_b32 s0, s10, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s5 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s0 -; GFX67-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x10 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10 -; GFX67-NEXT: s_lshr_b32 s8, s11, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s8 +; GFX67-NEXT: s_mov_b32 s9, 0 +; GFX67-NEXT: s_mov_b32 s8, s1 +; GFX67-NEXT: s_mov_b32 s1, s9 +; GFX67-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 +; GFX67-NEXT: s_load_dwordx8 s[8:15], s[8:9], 0x10 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX67-NEXT: s_lshr_b32 s7, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s6 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s7 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11 -; GFX67-NEXT: s_lshr_b32 s12, s5, 16 -; GFX67-NEXT: v_add_f32_e32 v13, v13, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s12 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX67-NEXT: s_lshr_b32 s13, s6, 16 -; GFX67-NEXT: v_add_f32_e32 v14, v14, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s5 -; GFX67-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s13 -; GFX67-NEXT: s_lshr_b32 s11, s4, 16 -; GFX67-NEXT: v_add_f32_e32 v10, v10, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s3 -; GFX67-NEXT: v_add_f32_e32 v11, v11, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s11 -; GFX67-NEXT: v_add_f32_e32 v12, v12, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4 -; GFX67-NEXT: s_lshr_b32 s9, s2, 16 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s9 -; GFX67-NEXT: s_lshr_b32 s10, s3, 16 -; GFX67-NEXT: v_add_f32_e32 v8, v8, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s2 -; GFX67-NEXT: v_add_f32_e32 v9, v9, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s10 -; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: s_lshr_b32 s23, s7, 16 +; GFX67-NEXT: s_lshr_b32 s28, s15, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v0, s28 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s23 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s14 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX67-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s15 +; GFX67-NEXT: s_lshr_b32 s22, s6, 16 +; GFX67-NEXT: s_lshr_b32 s29, s14, 16 +; GFX67-NEXT: s_lshr_b32 s21, s5, 16 +; GFX67-NEXT: s_lshr_b32 s28, s13, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s29 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s22 +; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s28 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s21 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s13 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s12 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s4 +; GFX67-NEXT: s_lshr_b32 s20, s4, 16 +; GFX67-NEXT: s_lshr_b32 s23, s12, 16 +; GFX67-NEXT: s_lshr_b32 s19, s3, 16 +; GFX67-NEXT: s_lshr_b32 s27, s11, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s23 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s20 +; GFX67-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s27 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s19 +; GFX67-NEXT: v_cvt_f16_f32_e32 v11, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v12, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s10 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX67-NEXT: s_lshr_b32 s18, s2, 16 +; GFX67-NEXT: s_lshr_b32 s26, s10, 16 +; GFX67-NEXT: s_lshr_b32 s17, s1, 16 +; GFX67-NEXT: s_lshr_b32 s25, s9, 16 +; GFX67-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s26 +; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s18 +; GFX67-NEXT: v_cvt_f16_f32_e32 v14, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s25 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s17 +; GFX67-NEXT: s_lshr_b32 s16, s0, 16 +; GFX67-NEXT: s_lshr_b32 s24, s8, 16 +; GFX67-NEXT: v_cvt_f16_f32_e32 v15, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v2, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GFX67-NEXT: v_add_f32_e32 v0, v4, v3 +; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s9 +; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s8 ; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s0 -; GFX67-NEXT: s_lshr_b32 s0, s1, 16 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s8 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v18 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s1 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v19 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s24 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s16 +; GFX67-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v16, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v4, v18, v17 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v9 -; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v11 -; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v12 -; GFX67-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v13 -; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v15 -; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX67-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX67-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX67-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX67-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX67-NEXT: v_or_b32_e32 v3, v14, v13 +; GFX67-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX67-NEXT: v_or_b32_e32 v5, v10, v5 +; GFX67-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX67-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 21abcbd4f5edc..76583e806b805 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1737,14 +1737,18 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test3: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v7, 0x4000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1899,14 +1903,18 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, ; GFX7-SDAG-LABEL: fmul_select_v2f16_test4: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v7, 0x3800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v5, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2129,11 +2137,12 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test6: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x4200 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2238,10 +2247,12 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test7: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc400 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x4800 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2346,10 +2357,11 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test8: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x8000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2428,7 +2440,6 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test9: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index d9b23d43d593d..305ce4e6d4c45 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -654,8 +654,8 @@ define float @divergent_vec_f16_LL(half %a, half %b) { ; GCN-LABEL: divergent_vec_f16_LL: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 78a00dd51c2b2..d879ebede164e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -398,9 +398,9 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -414,26 +414,23 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB2_3 ; SI-NEXT: .LBB2_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -443,40 +440,32 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB2_3: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 +; SI-NEXT: v_mov_b32_e32 v5, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v6, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB2_4: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB2_2 ; @@ -1083,9 +1072,9 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1115,26 +1104,23 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %T ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1160,40 +1146,32 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB5_3: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 +; SI-NEXT: v_mov_b32_e32 v5, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v6, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: s_branch .LBB5_2 ; @@ -1739,21 +1717,21 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1771,138 +1749,104 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_or_b32_e32 v11, v8, v14 -; SI-NEXT: v_or_b32_e32 v6, v6, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB8_3 ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB8_3: ; %T ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB8_4: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000 -; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mov_b32_e32 v8, 0x3d00 +; SI-NEXT: v_mov_b32_e32 v9, 0x3900 +; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000 +; SI-NEXT: v_mov_b32_e32 v11, 0x39000000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v2 -; SI-NEXT: v_or_b32_e32 v2, v5, v3 -; SI-NEXT: v_or_b32_e32 v3, v8, v4 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 +; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; SI-NEXT: v_or_b32_e32 v0, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_or_b32_e32 v3, v12, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xf16_extract_8xf16_0: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 614200803d6f1..edae2c393e5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -557,39 +557,30 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_cmp_eq_u32 s8, 1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; SI-NEXT: buffer_store_short v0, v[6:7], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -730,87 +721,71 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index e9014e212b76f..dfd1fa6020eab 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -673,17 +673,17 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_mul_f32_e32 v0, 4.0, v0 -; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: flat_store_short v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: v_extract_fabs_fold_v2f16: @@ -788,7 +788,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index f45070cbe88ee..6eec710a4c24e 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -756,6 +756,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -781,6 +783,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FASTFMA-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -807,6 +811,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() # ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-SLOWFMA-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -846,6 +852,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -871,6 +879,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 +; GCN-FASTFMA-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -897,6 +907,8 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() # ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 +; GCN-SLOWFMA-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index e57f0b6f33439..0b7533e2ecced 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -464,19 +464,19 @@ define amdgpu_kernel void @fadd_v2f16( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v0, v2, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -634,15 +634,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -773,15 +773,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index a723a67498d05..805b1421f94d0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2487,19 +2487,19 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v4f16: @@ -2615,7 +2615,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2912,7 +2912,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2964,7 +2964,7 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v2, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 @@ -3023,7 +3023,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v3, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -3085,26 +3085,26 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3150,33 +3150,33 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; CI-NEXT: v_or_b32_e32 v1, v1, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v0, v0, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3229,47 +3229,47 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v0, v0, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v2, v2, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_or_b32_e32 v3, v3, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_or_b32_e32 v0, v0, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v2, v2, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: v_or_b32_e32 v3, v3, v10 +; CI-NEXT: v_or_b32_e32 v4, v4, v7 ; CI-NEXT: v_or_b32_e32 v5, v5, v6 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3334,61 +3334,61 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; CI-NEXT: v_or_b32_e32 v0, v0, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v1, v1, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; CI-NEXT: v_or_b32_e32 v2, v2, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v3, v3, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; CI-NEXT: v_or_b32_e32 v4, v4, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v5, v5, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; CI-NEXT: v_or_b32_e32 v1, v1, v9 +; CI-NEXT: v_or_b32_e32 v3, v3, v12 +; CI-NEXT: v_or_b32_e32 v5, v5, v13 ; CI-NEXT: v_or_b32_e32 v6, v6, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v7, v7, v8 +; CI-NEXT: v_or_b32_e32 v7, v7, v11 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3486,117 +3486,117 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_or_b32_e32 v0, v0, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v1, v1, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v2, v2, v18 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v3, v3, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_or_b32_e32 v3, v3, v19 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: v_or_b32_e32 v4, v4, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v5, v5, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v6, v6, v18 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_or_b32_e32 v7, v7, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_or_b32_e32 v7, v7, v19 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_or_b32_e32 v8, v8, v16 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v9, v9, v17 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_or_b32_e32 v10, v10, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; CI-NEXT: v_or_b32_e32 v11, v11, v19 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_or_b32_e32 v11, v11, v18 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_or_b32_e32 v12, v12, v16 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; CI-NEXT: v_or_b32_e32 v13, v13, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_or_b32_e32 v12, v12, v16 +; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; CI-NEXT: v_or_b32_e32 v13, v13, v17 ; CI-NEXT: v_or_b32_e32 v14, v14, v16 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; CI-NEXT: v_or_b32_e32 v15, v15, v16 +; CI-NEXT: v_or_b32_e32 v15, v15, v18 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 17cc51d08a1e2..5d3f69c84b902 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -2455,26 +2455,26 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -2650,26 +2650,26 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -2844,26 +2844,26 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3038,26 +3038,26 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3233,26 +3233,26 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3428,26 +3428,26 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3623,26 +3623,26 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -3818,26 +3818,26 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4012,26 +4012,26 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4206,26 +4206,26 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4401,26 +4401,26 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4595,26 +4595,26 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4789,26 +4789,26 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -4983,26 +4983,26 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index a8703d5d6e51d..3f6750546618f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -741,21 +741,15 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: s_movk_i32 s4, 0x7fff ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_f16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: s_brev_b32 s4, -2 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_f16_bf16: @@ -791,22 +785,16 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s1, s1, 0x8000 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_f16_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX7-NEXT: s_brev_b32 s0, -2 -; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_f16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b80204e70851e..23753bc5970dd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -16,12 +16,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s1, 0xffff8000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_f16: @@ -141,8 +138,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, s0, 0x7fff -; SI-NEXT: s_bitset1_b32 s0, 15 +; SI-NEXT: s_or_b32 s0, s0, 0xffff8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg1: @@ -167,8 +163,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, s0, 0x7fff -; SI-NEXT: s_bitset1_b32 s0, 15 +; SI-NEXT: s_or_b32 s0, s0, 0xffff8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg10: @@ -348,11 +343,8 @@ define half @v_copysign_f16(half %mag, half %sign) { ; SI-LABEL: v_copysign_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16: @@ -484,8 +476,7 @@ define half @v_test_copysign_f16_neg1(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg1: @@ -519,8 +510,7 @@ define half @v_test_copysign_f16_neg10(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg10: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg10: @@ -739,10 +729,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: @@ -777,10 +767,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: @@ -816,11 +806,8 @@ define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: @@ -869,48 +856,41 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 -; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v5, v5, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; SI-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 ; SI-NEXT: s_movk_i32 s4, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v0, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-NEXT: v_and_b32_e32 v5, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 ; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_movk_i32 s4, 0x7fff ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f64_sign_f16: @@ -1105,53 +1085,47 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_and_b32 s2, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s2, s0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_and_b32 s3, s1, 0x1ff +; SI-NEXT: s_or_b32 s0, s3, s0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 -; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014 +; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014 ; SI-NEXT: s_and_b32 s0, s0, 0xffe -; SI-NEXT: v_readfirstlane_b32 s2, v1 -; SI-NEXT: s_sub_i32 s4, 0x3f1, s3 -; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: v_med3_i32 v1, s4, 0, 13 -; SI-NEXT: s_or_b32 s2, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_lshr_b32 s5, s2, s4 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_sub_i32 s4, 0x3f1, s1 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: v_med3_i32 v0, s4, 0, 13 +; SI-NEXT: s_or_b32 s3, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_lshr_b32 s5, s3, s4 ; SI-NEXT: s_lshl_b32 s4, s5, s4 -; SI-NEXT: s_cmp_lg_u32 s4, s2 -; SI-NEXT: s_cselect_b32 s2, 1, 0 -; SI-NEXT: s_addk_i32 s3, 0xfc10 -; SI-NEXT: s_lshl_b32 s4, s3, 12 -; SI-NEXT: s_or_b32 s2, s5, s2 +; SI-NEXT: s_cmp_lg_u32 s4, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s1, 0xfc10 +; SI-NEXT: s_lshl_b32 s4, s1, 12 +; SI-NEXT: s_or_b32 s3, s5, s3 ; SI-NEXT: s_or_b32 s4, s0, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 1 -; SI-NEXT: s_cselect_b32 s2, s2, s4 -; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_cmp_lt_i32 s1, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s3, 7 ; SI-NEXT: s_cmp_gt_i32 s4, 5 ; SI-NEXT: s_cselect_b32 s5, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s4, 3 ; SI-NEXT: s_cselect_b32 s4, 1, 0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshr_b32 s2, s2, 2 -; SI-NEXT: s_add_i32 s2, s2, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 31 -; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: s_cmp_lt_i32 s1, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s0, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s2 -; SI-NEXT: s_lshr_b32 s1, s1, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s3 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_and_b32 s1, s2, 0xffff8000 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: @@ -1363,19 +1337,14 @@ define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inr ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s3, s0, 0x7fff +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16: @@ -1411,25 +1380,18 @@ define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inr define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: s_and_b32 s4, s2, 0x8000 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_and_b32 s5, s0, 0x7fff +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_and_b32 s1, s1, 0x7fff +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s0, s4, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v3f16: @@ -1476,33 +1438,24 @@ define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x hal define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s5, s3, 16 +; SI-NEXT: s_and_b32 s6, s1, 0x7fff +; SI-NEXT: s_and_b32 s5, s5, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s5 +; SI-NEXT: s_or_b32 s3, s6, s3 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_lshr_b32 s4, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_and_b32 s3, s0, 0x7fff +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_and_b32 s3, s4, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s2, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v4f16: @@ -1549,59 +1502,42 @@ define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x hal define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v8f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_and_b32 s12, s3, 0x7fff +; SI-NEXT: s_and_b32 s11, s11, 0x8000 +; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; SI-NEXT: s_and_b32 s7, s7, 0x8000 +; SI-NEXT: s_or_b32 s3, s3, s11 +; SI-NEXT: s_or_b32 s7, s12, s7 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_and_b32 s7, s2, 0x7fff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s10, 0x8000 +; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; SI-NEXT: s_or_b32 s2, s2, s7 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_or_b32 s2, s6, s2 +; SI-NEXT: s_and_b32 s5, s5, 0x8000 +; SI-NEXT: s_and_b32 s6, s1, 0x7fff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s9, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_or_b32 s1, s1, s6 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 -; SI-NEXT: s_lshr_b32 s8, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: s_lshr_b32 s8, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 -; SI-NEXT: s_lshr_b32 s8, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: s_lshr_b32 s8, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 -; SI-NEXT: s_lshr_b32 s8, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 -; SI-NEXT: s_lshr_b32 s8, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s3 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v7, s0, v15, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v5, s0, v13, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v4 -; SI-NEXT: v_readfirstlane_b32 s3, v6 +; SI-NEXT: s_or_b32 s1, s5, s1 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_and_b32 s5, s0, 0x7fff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s8, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s5 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s4, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v8f16: @@ -1671,111 +1607,78 @@ define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x hal define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x half> inreg %arg_sign) { ; SI-LABEL: s_copysign_v16f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s23, s15, 16 +; SI-NEXT: s_and_b32 s24, s7, 0x7fff +; SI-NEXT: s_and_b32 s23, s23, 0x8000 +; SI-NEXT: s_bfe_u32 s7, s7, 0xf0010 +; SI-NEXT: s_and_b32 s15, s15, 0x8000 +; SI-NEXT: s_or_b32 s7, s7, s23 +; SI-NEXT: s_or_b32 s15, s24, s15 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshr_b32 s22, s14, 16 +; SI-NEXT: s_or_b32 s7, s15, s7 +; SI-NEXT: s_and_b32 s14, s14, 0x8000 +; SI-NEXT: s_and_b32 s15, s6, 0x7fff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s22, 0x8000 +; SI-NEXT: s_bfe_u32 s6, s6, 0xf0010 +; SI-NEXT: s_or_b32 s6, s6, s15 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshr_b32 s21, s13, 16 +; SI-NEXT: s_or_b32 s6, s14, s6 +; SI-NEXT: s_and_b32 s13, s13, 0x8000 +; SI-NEXT: s_and_b32 s14, s5, 0x7fff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s21, 0x8000 +; SI-NEXT: s_bfe_u32 s5, s5, 0xf0010 +; SI-NEXT: s_or_b32 s5, s5, s14 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshr_b32 s20, s12, 16 +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: s_and_b32 s12, s12, 0x8000 +; SI-NEXT: s_and_b32 s13, s4, 0x7fff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s20, 0x8000 +; SI-NEXT: s_bfe_u32 s4, s4, 0xf0010 +; SI-NEXT: s_or_b32 s4, s4, s13 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s19, s11, 16 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_and_b32 s11, s11, 0x8000 +; SI-NEXT: s_and_b32 s12, s3, 0x7fff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s19, 0x8000 +; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; SI-NEXT: s_or_b32 s3, s3, s12 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: s_lshr_b32 s18, s10, 16 +; SI-NEXT: s_or_b32 s3, s11, s3 +; SI-NEXT: s_and_b32 s10, s10, 0x8000 +; SI-NEXT: s_and_b32 s11, s2, 0x7fff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s18, 0x8000 +; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; SI-NEXT: s_or_b32 s2, s2, s11 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s17, s9, 16 +; SI-NEXT: s_or_b32 s2, s10, s2 +; SI-NEXT: s_and_b32 s9, s9, 0x8000 +; SI-NEXT: s_and_b32 s10, s1, 0x7fff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s17, 0x8000 +; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010 +; SI-NEXT: s_or_b32 s1, s1, s10 +; SI-NEXT: s_lshl_b32 s1, s1, 16 ; SI-NEXT: s_lshr_b32 s16, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 -; SI-NEXT: s_lshr_b32 s16, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 -; SI-NEXT: s_lshr_b32 s16, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 -; SI-NEXT: s_lshr_b32 s16, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s16 -; SI-NEXT: s_lshr_b32 s16, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s16 -; SI-NEXT: s_lshr_b32 s16, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s16 -; SI-NEXT: s_lshr_b32 s16, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s16 -; SI-NEXT: s_lshr_b32 s16, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 -; SI-NEXT: s_lshr_b32 s16, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s16 -; SI-NEXT: s_lshr_b32 s16, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 -; SI-NEXT: s_lshr_b32 s16, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s16 -; SI-NEXT: s_lshr_b32 s16, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s16 -; SI-NEXT: s_lshr_b32 s16, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 -; SI-NEXT: s_lshr_b32 s16, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s16 -; SI-NEXT: s_lshr_b32 s16, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s16 -; SI-NEXT: s_lshr_b32 s16, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v14, s0, v15, v14 -; SI-NEXT: v_bfi_b32 v18, s0, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_bfi_b32 v10, s0, v11, v10 -; SI-NEXT: v_bfi_b32 v15, s0, v19, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 -; SI-NEXT: v_bfi_b32 v12, s0, v13, v12 -; SI-NEXT: v_bfi_b32 v8, s0, v9, v8 -; SI-NEXT: v_bfi_b32 v18, s0, v18, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v11, s0, v19, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s9 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, s11 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s3 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s2 -; SI-NEXT: v_bfi_b32 v6, s0, v7, v6 -; SI-NEXT: v_bfi_b32 v4, s0, v5, v4 -; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v7, s0, v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v5, s0, v11, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v3, s0, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v4 -; SI-NEXT: v_readfirstlane_b32 s3, v6 -; SI-NEXT: v_readfirstlane_b32 s4, v8 -; SI-NEXT: v_readfirstlane_b32 s5, v10 -; SI-NEXT: v_readfirstlane_b32 s6, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: s_or_b32 s1, s9, s1 +; SI-NEXT: s_and_b32 s8, s8, 0x8000 +; SI-NEXT: s_and_b32 s9, s0, 0x7fff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s16, 0x8000 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s9 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s8, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v16f16: @@ -1886,18 +1789,14 @@ define <2 x half> @v_copysign_v2f16(<2 x half> %mag, <2 x half> %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16: @@ -1927,23 +1826,18 @@ define <3 x half> @v_copysign_v3f16(<3 x half> %mag, <3 x half> %sign) { ; SI-LABEL: v_copysign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v3f16: @@ -1977,30 +1871,23 @@ define <4 x half> @v_copysign_v4f16(<4 x half> %mag, <4 x half> %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v7, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v4f16: @@ -2033,55 +1920,42 @@ define <8 x half> @v_copysign_v8f16(<8 x half> %mag, <8 x half> %sign) { ; SI-LABEL: v_copysign_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v4, s4, v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v5, s4, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v6, s4, v13, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 -; SI-NEXT: v_bfi_b32 v7, s4, v15, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v8f16: @@ -2120,103 +1994,78 @@ define <16 x half> @v_copysign_v16f16(<16 x half> %mag, <16 x half> %sign) { ; SI-LABEL: v_copysign_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; SI-NEXT: v_bfi_b32 v15, s4, v19, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; SI-NEXT: v_bfi_b32 v14, s4, v18, v14 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_bfi_b32 v13, s4, v19, v13 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v12, s4, v18, v12 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v11, s4, v19, v11 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_bfi_b32 v10, s4, v18, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_bfi_b32 v8, s4, v17, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v9 -; SI-NEXT: v_bfi_b32 v9, s4, v19, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v16, 0x8000, v15 +; SI-NEXT: v_and_b32_e32 v17, 0x7fff, v7 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_and_b32_e32 v17, 0x8000, v14 +; SI-NEXT: v_and_b32_e32 v18, 0x7fff, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0x8000, v13 +; SI-NEXT: v_and_b32_e32 v19, 0x7fff, v5 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_and_b32_e32 v19, 0x8000, v12 +; SI-NEXT: v_and_b32_e32 v20, 0x7fff, v4 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_and_b32_e32 v20, 0x8000, v11 +; SI-NEXT: v_and_b32_e32 v21, 0x7fff, v3 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v21, 0x8000, v10 +; SI-NEXT: v_and_b32_e32 v22, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v22, 0x8000, v9 +; SI-NEXT: v_and_b32_e32 v23, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v23, 0x8000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v24, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v15, 0x8000, v15 +; SI-NEXT: v_bfe_u32 v7, v7, 16, 15 +; SI-NEXT: v_and_b32_e32 v14, 0x8000, v14 +; SI-NEXT: v_bfe_u32 v6, v6, 16, 15 +; SI-NEXT: v_and_b32_e32 v13, 0x8000, v13 +; SI-NEXT: v_bfe_u32 v5, v5, 16, 15 +; SI-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; SI-NEXT: v_bfe_u32 v4, v4, 16, 15 +; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 ; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_or_b32_e32 v3, v9, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v13 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v8, v4 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v16f16: @@ -2267,201 +2116,178 @@ define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) { ; SI-LABEL: v_copysign_v32f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v58, 0x8000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v17, 0x8000, v17 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v38, 0x8000, v27 +; SI-NEXT: v_and_b32_e32 v39, 0x7fff, v11 +; SI-NEXT: v_and_b32_e32 v48, 0x8000, v26 +; SI-NEXT: v_and_b32_e32 v49, 0x7fff, v10 +; SI-NEXT: v_and_b32_e32 v50, 0x8000, v25 +; SI-NEXT: v_and_b32_e32 v51, 0x7fff, v9 +; SI-NEXT: v_and_b32_e32 v40, 0x8000, v22 +; SI-NEXT: v_and_b32_e32 v41, 0x7fff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_and_b32_e32 v52, 0x8000, v24 +; SI-NEXT: v_and_b32_e32 v53, 0x7fff, v8 +; SI-NEXT: v_and_b32_e32 v54, 0x8000, v23 +; SI-NEXT: v_and_b32_e32 v55, 0x7fff, v7 +; SI-NEXT: v_and_b32_e32 v42, 0x8000, v21 +; SI-NEXT: v_and_b32_e32 v43, 0x7fff, v5 +; SI-NEXT: v_and_b32_e32 v44, 0x8000, v20 +; SI-NEXT: v_and_b32_e32 v45, 0x7fff, v4 +; SI-NEXT: v_and_b32_e32 v46, 0x8000, v19 +; SI-NEXT: v_and_b32_e32 v47, 0x7fff, v3 +; SI-NEXT: v_and_b32_e32 v56, 0x8000, v18 +; SI-NEXT: v_and_b32_e32 v57, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v38, v39, v38 +; SI-NEXT: v_or_b32_e32 v39, v49, v48 +; SI-NEXT: v_or_b32_e32 v48, v51, v50 +; SI-NEXT: v_or_b32_e32 v51, v41, v40 +; SI-NEXT: v_or_b32_e32 v40, v59, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v49, v53, v52 +; SI-NEXT: v_or_b32_e32 v50, v55, v54 +; SI-NEXT: v_or_b32_e32 v52, v43, v42 +; SI-NEXT: v_or_b32_e32 v53, v45, v44 +; SI-NEXT: v_or_b32_e32 v54, v47, v46 +; SI-NEXT: v_or_b32_e32 v55, v57, v56 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_and_b32_e32 v32, 0x8000, v30 +; SI-NEXT: v_and_b32_e32 v33, 0x7fff, v14 +; SI-NEXT: v_and_b32_e32 v34, 0x8000, v29 +; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v13 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_and_b32_e32 v33, 0x8000, v16 +; SI-NEXT: v_or_b32_e32 v34, v35, v34 +; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v36, 0x8000, v28 +; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v12 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v31, s4, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v13 -; SI-NEXT: v_bfi_b32 v14, s4, v14, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_bfi_b32 v30, s4, v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_bfi_b32 v13, s4, v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_bfi_b32 v29, s4, v32, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_bfi_b32 v12, s4, v12, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_bfi_b32 v28, s4, v32, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_bfi_b32 v11, s4, v11, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_bfi_b32 v27, s4, v32, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_bfi_b32 v10, s4, v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_bfi_b32 v32, s4, v32, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v9, s4, v9, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_bfi_b32 v26, s4, v26, v34 -; SI-NEXT: v_bfi_b32 v15, s4, v15, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_bfi_b32 v25, s4, v33, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: v_bfi_b32 v8, s4, v8, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_bfi_b32 v24, s4, v33, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_bfi_b32 v7, s4, v7, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_bfi_b32 v23, s4, v33, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_bfi_b32 v6, s4, v6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_bfi_b32 v22, s4, v33, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_bfi_b32 v5, s4, v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_bfi_b32 v21, s4, v33, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_bfi_b32 v4, s4, v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_bfi_b32 v20, s4, v33, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v19, s4, v33, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v18, s4, v33, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v17, s4, v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v36, v37, v36 +; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v15 +; SI-NEXT: v_bfe_u32 v15, v15, 16, 15 +; SI-NEXT: v_and_b32_e32 v30, 0x8000, v30 +; SI-NEXT: v_bfe_u32 v14, v14, 16, 15 +; SI-NEXT: v_and_b32_e32 v29, 0x8000, v29 +; SI-NEXT: v_bfe_u32 v13, v13, 16, 15 +; SI-NEXT: v_and_b32_e32 v28, 0x8000, v28 +; SI-NEXT: v_bfe_u32 v12, v12, 16, 15 +; SI-NEXT: v_and_b32_e32 v27, 0x8000, v27 +; SI-NEXT: v_bfe_u32 v11, v11, 16, 15 +; SI-NEXT: v_and_b32_e32 v26, 0x8000, v26 +; SI-NEXT: v_bfe_u32 v10, v10, 16, 15 +; SI-NEXT: v_and_b32_e32 v25, 0x8000, v25 +; SI-NEXT: v_bfe_u32 v9, v9, 16, 15 +; SI-NEXT: v_and_b32_e32 v24, 0x8000, v24 +; SI-NEXT: v_bfe_u32 v8, v8, 16, 15 +; SI-NEXT: v_and_b32_e32 v23, 0x8000, v23 +; SI-NEXT: v_bfe_u32 v7, v7, 16, 15 +; SI-NEXT: v_and_b32_e32 v22, 0x8000, v22 +; SI-NEXT: v_bfe_u32 v6, v6, 16, 15 +; SI-NEXT: v_and_b32_e32 v21, 0x8000, v21 +; SI-NEXT: v_bfe_u32 v5, v5, 16, 15 +; SI-NEXT: v_and_b32_e32 v20, 0x8000, v20 +; SI-NEXT: v_bfe_u32 v4, v4, 16, 15 +; SI-NEXT: v_and_b32_e32 v19, 0x8000, v19 +; SI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; SI-NEXT: v_and_b32_e32 v18, 0x8000, v18 +; SI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; SI-NEXT: v_and_b32_e32 v16, 0x8000, v16 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v14, v14, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v29 +; SI-NEXT: v_or_b32_e32 v12, v12, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v0, v16, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v3, v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v22 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: v_or_b32_e32 v5, v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v6, v16, v6 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0x8000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0x8000, v31 +; SI-NEXT: v_or_b32_e32 v15, v15, v31 +; SI-NEXT: v_or_b32_e32 v35, v37, v35 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v9, v17, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v28 -; SI-NEXT: v_or_b32_e32 v10, v16, v10 -; SI-NEXT: v_or_b32_e32 v11, v17, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 -; SI-NEXT: v_or_b32_e32 v12, v16, v12 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v7, v50, v7 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_or_b32_e32 v9, v48, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 +; SI-NEXT: v_or_b32_e32 v12, v36, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_or_b32_e32 v15, v35, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v32f32: @@ -2730,12 +2556,10 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s1, 0x80000000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f32: @@ -2781,12 +2605,10 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, double inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s1, s2, 0x80000000 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f64: @@ -2833,11 +2655,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half ; SI-LABEL: s_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_and_b32 s0, s1, 0xffff8000 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, s0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -3061,18 +2881,15 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, < ; SI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3114,107 +2931,98 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, < ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %mag.trunc = fptrunc <2 x float> %mag to <2 x half> - %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign) - ret <2 x half> %out -} - -define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) { -; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 -; SI-NEXT: v_or_b32_e32 v6, 0x1000, v0 -; SI-NEXT: v_med3_i32 v8, v8, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v8, v0, v8 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_and_b32_e32 v8, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %mag.trunc = fptrunc <2 x float> %mag to <2 x half> + %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign) + ret <2 x half> %out +} + +define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) { +; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 ; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 +; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v3 ; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v11, v10, v6 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6 +; SI-NEXT: v_med3_i32 v7, v7, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6 +; SI-NEXT: s_movk_i32 s5, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v10, v2, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v3 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v7, v2, v7 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_mov_b32_e32 v8, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v1 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v9, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3573,20 +3381,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: v_bfi_b32 v1, s4, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32: @@ -3636,16 +3440,16 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -3902,16 +3706,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_lshr_b32 s0, s2, 16 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: s_and_b32 s1, s2, 0x8000 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v1, s0, v1 +; SI-NEXT: v_or_b32_e32 v0, s1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -3968,100 +3769,90 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshr_b32 s6, s3, 8 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s1, 8 -; SI-NEXT: s_and_b32 s5, s5, 0xffe -; SI-NEXT: s_and_b32 s6, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s6, s0 +; SI-NEXT: s_and_b32 s8, s6, 0xffe +; SI-NEXT: s_and_b32 s6, s3, 0x1ff +; SI-NEXT: s_or_b32 s2, s6, s2 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; SI-NEXT: s_bfe_u32 s6, s1, 0xb0014 -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: s_sub_i32 s7, 0x3f1, s6 -; SI-NEXT: s_or_b32 s0, s5, s0 -; SI-NEXT: v_med3_i32 v1, s7, 0, 13 -; SI-NEXT: s_or_b32 s5, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: s_lshr_b32 s8, s5, s7 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; SI-NEXT: s_bfe_u32 s3, s3, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s2, v0 +; SI-NEXT: s_sub_i32 s7, 0x3f1, s3 +; SI-NEXT: s_or_b32 s2, s8, s2 +; SI-NEXT: v_med3_i32 v0, s7, 0, 13 +; SI-NEXT: s_or_b32 s6, s2, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_lshr_b32 s8, s6, s7 ; SI-NEXT: s_lshl_b32 s7, s8, s7 -; SI-NEXT: s_cmp_lg_u32 s7, s5 -; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_addk_i32 s6, 0xfc10 -; SI-NEXT: s_lshl_b32 s7, s6, 12 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s7, s0, s7 -; SI-NEXT: s_cmp_lt_i32 s6, 1 -; SI-NEXT: s_cselect_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s5, 7 +; SI-NEXT: s_cmp_lg_u32 s7, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_addk_i32 s3, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s3, 12 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s7, s2, s7 +; SI-NEXT: s_cmp_lt_i32 s3, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s6, 7 ; SI-NEXT: s_cmp_gt_i32 s7, 5 ; SI-NEXT: s_cselect_b32 s8, 1, 0 ; SI-NEXT: s_cmp_eq_u32 s7, 3 ; SI-NEXT: s_cselect_b32 s7, 1, 0 ; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_lshr_b32 s5, s5, 2 -; SI-NEXT: s_add_i32 s5, s5, s7 -; SI-NEXT: s_cmp_lt_i32 s6, 31 -; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s7 +; SI-NEXT: s_cmp_lt_i32 s3, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_movk_i32 s7, 0x7e00 -; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s5, s1, s0 -; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s6, s0, 0xffe -; SI-NEXT: s_and_b32 s0, s3, 0x1ff -; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v1 -; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; SI-NEXT: s_or_b32 s0, s6, s0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s2 -; SI-NEXT: v_med3_i32 v1, s6, 0, 13 -; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: s_lshr_b32 s8, s1, s6 -; SI-NEXT: s_lshl_b32 s6, s8, s6 -; SI-NEXT: s_cmp_lg_u32 s6, s1 -; SI-NEXT: s_cselect_b32 s1, 1, 0 -; SI-NEXT: s_addk_i32 s2, 0xfc10 -; SI-NEXT: s_lshl_b32 s6, s2, 12 -; SI-NEXT: s_or_b32 s1, s8, s1 -; SI-NEXT: s_or_b32 s6, s0, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s6 -; SI-NEXT: s_and_b32 s6, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s2, s7, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f +; SI-NEXT: s_cselect_b32 s6, s2, s6 +; SI-NEXT: s_lshr_b32 s2, s1, 8 +; SI-NEXT: s_and_b32 s8, s2, 0xffe +; SI-NEXT: s_and_b32 s2, s1, 0x1ff +; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_sub_i32 s3, 0x3f1, s1 +; SI-NEXT: s_or_b32 s0, s8, s0 +; SI-NEXT: v_med3_i32 v0, s3, 0, 13 +; SI-NEXT: s_or_b32 s2, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_lshr_b32 s8, s2, s3 +; SI-NEXT: s_lshl_b32 s3, s8, s3 +; SI-NEXT: s_cmp_lg_u32 s3, s2 +; SI-NEXT: s_cselect_b32 s2, 1, 0 +; SI-NEXT: s_addk_i32 s1, 0xfc10 +; SI-NEXT: s_lshl_b32 s3, s1, 12 +; SI-NEXT: s_or_b32 s2, s8, s2 +; SI-NEXT: s_or_b32 s3, s0, s3 +; SI-NEXT: s_cmp_lt_i32 s1, 1 +; SI-NEXT: s_cselect_b32 s2, s2, s3 +; SI-NEXT: s_and_b32 s3, s2, 7 +; SI-NEXT: s_cmp_gt_i32 s3, 5 ; SI-NEXT: s_cselect_b32 s8, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 31 -; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; SI-NEXT: s_cmp_eq_u32 s3, 3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_or_b32 s3, s3, s8 +; SI-NEXT: s_lshr_b32 s2, s2, 2 +; SI-NEXT: s_add_i32 s2, s2, s3 +; SI-NEXT: s_cmp_lt_i32 s1, 31 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s1 -; SI-NEXT: s_lshr_b32 s1, s3, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s2 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_and_b32 s1, s4, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_and_b32 s1, s6, 0x7fff +; SI-NEXT: s_and_b32 s2, s5, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: @@ -4358,16 +4149,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v1, s0, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s0, v3, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_and_b32 s1, s0, 0x7fff +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v1, s0, v1 +; SI-NEXT: v_or_b32_e32 v0, s1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -4424,19 +4211,16 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s2, s2, 0x80000000 +; SI-NEXT: s_and_b32 s1, s0, 0x7fff +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: s_and_b32 s2, s4, 0x80000000 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_lshl_b32 s0, s0, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -4618,14 +4402,14 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag ; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v8 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v7 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v8 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: @@ -4687,24 +4471,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, < ; SI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v5 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4764,139 +4544,126 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0x1ff, v3 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffe, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffe, v8 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v10, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 -; SI-NEXT: v_or_b32_e32 v7, 0x1000, v2 -; SI-NEXT: v_med3_i32 v11, v11, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v7 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v8, 0x1000, v2 +; SI-NEXT: v_med3_i32 v9, v9, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8 ; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v11, v2, v11 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; SI-NEXT: v_and_b32_e32 v11, 7, v7 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v3 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v2, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_and_b32_e32 v9, 7, v8 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 2, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; SI-NEXT: v_mov_b32_e32 v9, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; SI-NEXT: v_mov_b32_e32 v10, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 -; SI-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; SI-NEXT: v_and_b32_e32 v8, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v1, 20, 11 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v3 +; SI-NEXT: v_med3_i32 v8, v8, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v11, v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v10, v0, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; SI-NEXT: v_and_b32_e32 v8, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v4, v5, 20, 11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 +; SI-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v8, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v4 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: v_or_b32_e32 v7, v1, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-NEXT: v_and_b32_e32 v5, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v8 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5394,25 +5161,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32: @@ -5471,19 +5233,20 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: @@ -5554,17 +5317,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f16_sign_v4f32(<4 x half> %mag, < ; SI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 ; SI-NEXT: v_bfi_b32 v2, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v7, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v6, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v6, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v7, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32: @@ -5636,17 +5399,17 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, ; SI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: @@ -5721,10 +5484,10 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v8 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v9 @@ -5802,33 +5565,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, < ; SI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16: @@ -5900,178 +5658,161 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v13, v3, 20, 11 +; SI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v12, v2 -; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13 +; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v3 ; SI-NEXT: v_or_b32_e32 v12, 0x1000, v2 -; SI-NEXT: v_med3_i32 v14, v14, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v14, v12 +; SI-NEXT: v_med3_i32 v13, v13, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v14, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v12 ; SI-NEXT: s_movk_i32 s5, 0xfc10 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v14, v2, v14 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_and_b32_e32 v14, 7, v12 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v14 -; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v3 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_or_b32_e32 v13, v2, v13 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SI-NEXT: v_and_b32_e32 v13, 7, v12 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 ; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; SI-NEXT: v_mov_b32_e32 v14, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_mov_b32_e32 v15, 0x7e00 +; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; SI-NEXT: v_mov_b32_e32 v13, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SI-NEXT: v_mov_b32_e32 v14, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13 +; SI-NEXT: v_cndmask_b32_e32 v2, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; SI-NEXT: v_or_b32_e32 v0, v12, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v12, v1, 20, 11 +; SI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12 +; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v1 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 -; SI-NEXT: v_med3_i32 v13, v13, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v3 -; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v3 +; SI-NEXT: v_med3_i32 v12, v12, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v15, v12, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12 -; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; SI-NEXT: v_and_b32_e32 v13, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 -; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v1 +; SI-NEXT: v_or_b32_e32 v3, v15, v3 +; SI-NEXT: v_or_b32_e32 v12, v0, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; SI-NEXT: v_and_b32_e32 v12, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 +; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v12, v12, v15 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 +; SI-NEXT: v_cndmask_b32_e32 v0, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v7 -; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v7 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v6, v7, 20, 11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v6 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 -; SI-NEXT: v_med3_i32 v12, v12, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3 +; SI-NEXT: v_med3_i32 v7, v7, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v6 -; SI-NEXT: v_or_b32_e32 v3, v13, v3 -; SI-NEXT: v_or_b32_e32 v12, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v7, v1, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; SI-NEXT: v_and_b32_e32 v12, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_and_b32_e32 v7, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 ; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v12 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v13, v14, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SI-NEXT: v_bfe_u32 v5, v5, 20, 11 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v5 ; SI-NEXT: v_or_b32_e32 v4, 0x1000, v3 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v4 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v4, v12, v4 -; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v4 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v6, v3, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v4 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 ; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v11 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v10 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v9 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6724,33 +6465,26 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v2, s4, v7, v5 -; SI-NEXT: v_bfi_b32 v3, s4, v6, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1 +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32: @@ -6816,25 +6550,26 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v3, s4, v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v7 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfe_u32 v1, v1, 16, 15 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_bfe_u32 v0, v0, 16, 15 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: @@ -7268,7 +7003,7 @@ define half @v_copysign_f16_0_f64(double %sign) { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16_0_f64: @@ -7306,16 +7041,7 @@ define half @v_copysign_f16_0_f64(double %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x80008000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16_0_v2f16: @@ -7344,15 +7070,7 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x80008000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2f16: @@ -7380,16 +7098,12 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -7444,12 +7158,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7499,13 +7209,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) { define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) { ; SI-LABEL: s_copysign_v2f16_0_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, 0x80000000, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: s_and_b32 s0, 0x80000000, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s1, 0x80000000 +; SI-NEXT: s_and_b32 s1, s3, 0x80000000 +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v2f16_0_v2f64: @@ -7541,12 +7248,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) { ; SI-LABEL: v_copysign_v2f16_0_v2bf64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_v2f16_0_v2bf64: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 1779c45203f47..fd5c47d36a752 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -2143,7 +2143,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 @@ -2155,21 +2154,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 -; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v5, v5 -; SI-NEXT: v_mul_f32_e32 v5, v3, v2 -; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2351,7 +2351,6 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 @@ -2363,21 +2362,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; SI-NEXT: v_fma_f32 v5, v6, v3, v5 ; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 -; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v2, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v2, v2, v5, v5 -; SI-NEXT: v_mul_f32_e32 v5, v3, v2 -; SI-NEXT: v_fma_f32 v6, -v4, v5, v3 -; SI-NEXT: v_fma_f32 v5, v6, v2, v5 -; SI-NEXT: v_fma_f32 v3, -v4, v5, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index dcf0519dee355..262b6c53fa2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8690,24 +8690,22 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8718,7 +8716,7 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -9071,24 +9069,22 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9099,7 +9095,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9454,24 +9450,22 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9482,7 +9476,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9814,23 +9808,21 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10183,14 +10175,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10554,14 +10544,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10833,10 +10821,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11117,8 +11103,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11494,24 +11478,22 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11522,7 +11504,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11869,14 +11851,12 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16430,49 +16410,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -16626,49 +16596,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16836,49 +16796,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17023,41 +16973,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17211,41 +17152,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17416,41 +17348,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17612,49 +17535,39 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -17808,41 +17721,32 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17997,49 +17901,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18183,41 +18077,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18371,49 +18256,39 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -18557,41 +18432,32 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index a412a4eebe7ea..3919ba4e2b1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6350,24 +6350,22 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6378,7 +6376,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,24 +6754,22 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6784,7 +6780,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,24 +7160,22 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7192,7 +7186,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,23 +7538,21 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7938,14 +7930,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8334,14 +8324,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8644,8 +8632,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8935,10 +8921,8 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9339,24 +9323,22 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9367,7 +9349,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9739,14 +9721,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14358,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14602,49 +14572,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14862,49 +14822,39 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15096,41 +15046,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15331,41 +15272,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15586,41 +15518,32 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15830,49 +15753,39 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16073,41 +15986,32 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c05d76a63a1d4..858ff79ade52f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6350,24 +6350,22 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6378,7 +6376,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,24 +6754,22 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6784,7 +6780,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,24 +7160,22 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7192,7 +7186,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,23 +7538,21 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7938,14 +7930,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8334,14 +8324,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8644,8 +8632,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8935,10 +8921,8 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9339,24 +9323,22 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9367,7 +9349,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9739,14 +9721,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14358,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14602,49 +14572,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14862,49 +14822,39 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15096,41 +15046,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15331,41 +15272,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15586,41 +15518,32 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15830,49 +15753,39 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16073,41 +15986,32 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index d7c913cafd7d9..0fb799ea66461 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -6137,24 +6137,22 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6165,7 +6163,7 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst ret half %result @@ -6518,24 +6516,22 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6546,7 +6542,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -6901,24 +6897,22 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6929,7 +6923,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -7261,23 +7255,21 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7630,14 +7622,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8001,14 +7991,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8291,8 +8279,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-NEXT: flat_load_dword v0, v[3:4] ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8562,10 +8548,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8941,24 +8925,22 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 ; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8969,7 +8951,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val seq_cst @@ -9316,14 +9298,12 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13908,49 +13888,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX7-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -14135,49 +14105,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14378,49 +14338,39 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14593,41 +14543,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX7-LABEL: flat_agent_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14809,41 +14750,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15045,41 +14977,32 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15272,49 +15195,39 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst @@ -15496,41 +15409,32 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 7afdf102f5295..9a3dc507f295b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1039,20 +1039,26 @@ define half @v_max3_f16_maximumnum_maximumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,40 +1124,52 @@ define <2 x half> @v_max3_v2f16_maximumnum_maximumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v2f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,23 +1255,32 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_max_f32_e32 v2, v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1262,23 +1289,32 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX7-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1397,70 +1433,94 @@ define <4 x half> @v_max3_v4f16_maximumnum_maximumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_max3_f32 v9, v11, v10, v9 -; GFX6-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX6-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v8, v10, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v2, v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_max3_f32 v9, v11, v10, v9 -; GFX7-NEXT: v_max3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX7-NEXT: v_max3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v8, v10, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 38ab4c2712a2c..94f7eee4a6efb 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -400,27 +400,30 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_max3_f32 v0, v0, v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -690,27 +693,30 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_max3_f32 v0, v2, v0, v1 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -977,20 +983,30 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_max_f32_e32 v1, v7, v6 -; SI-NEXT: v_max3_f32 v0, v2, v0, v3 -; SI-NEXT: v_max3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_max_f32_e32 v5, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_max_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index b187f39c786aa..1b494deca08aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -26,10 +26,10 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: @@ -119,15 +119,16 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -176,14 +177,14 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_max_f32_e32 v1, v3, v2 +; SI-NEXT: v_max_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -235,19 +236,21 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -304,17 +307,17 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 ; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max_f32_e32 v2, v5, v4 +; SI-NEXT: v_max_f32_e32 v2, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -381,28 +384,30 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_max_legacy_f32_e32 v2, v7, v6 -; SI-NEXT: v_max_legacy_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v15, v14 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v13, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -469,28 +474,28 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 ; SI-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NEXT: v_max_f32_e32 v2, v7, v6 -; SI-NEXT: v_max_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v2, v6, v7 +; SI-NEXT: v_max_f32_e32 v3, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -581,50 +586,54 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_max_legacy_f32_e32 v4, v15, v14 -; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v12 -; SI-NEXT: v_max_legacy_f32_e32 v6, v11, v10 -; SI-NEXT: v_max_legacy_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cmp_nle_f32_e64 s[4:5], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_cmp_nle_f32_e64 s[6:7], v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cmp_nle_f32_e64 s[8:9], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v6, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v19, v18 +; SI-NEXT: v_cndmask_b32_e32 v6, v13, v12, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 @@ -721,50 +730,50 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_max_f32_e32 v3, v3, v7 ; SI-NEXT: v_max_f32_e32 v2, v2, v6 ; SI-NEXT: v_max_f32_e32 v1, v1, v5 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NEXT: v_max_f32_e32 v4, v15, v14 -; SI-NEXT: v_max_f32_e32 v5, v13, v12 -; SI-NEXT: v_max_f32_e32 v6, v11, v10 -; SI-NEXT: v_max_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v4, v14, v15 +; SI-NEXT: v_max_f32_e32 v5, v12, v13 +; SI-NEXT: v_max_f32_e32 v6, v10, v11 +; SI-NEXT: v_max_f32_e32 v7, v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index d8014962eb3bd..7a89b58f239a9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -17,25 +17,15 @@ declare float @llvm.fabs.f32(float) #0 declare half @llvm.fabs.f16(half) #0 define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16: ; GFX8-SDAG: ; %bb.0: @@ -73,25 +63,15 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_flags: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_flags: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_flags: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_flags: ; GFX8-SDAG: ; %bb.0: @@ -129,29 +109,17 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v1, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v1, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GFX7-NEXT: flat_store_dword v[3:4], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use: ; GFX8: ; %bb.0: @@ -186,23 +154,14 @@ define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, p } define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, 2.0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, 2.0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, 2.0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0: ; GFX8-SDAG: ; %bb.0: @@ -238,23 +197,14 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, 2.0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, v0, 2.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k1: ; GFX8-SDAG: ; %bb.0: @@ -290,23 +240,14 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, 2.0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, 2.0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, 2.0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k2: ; GFX8-SDAG: ; %bb.0: @@ -526,25 +467,15 @@ define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: ; GFX8-SDAG: ; %bb.0: @@ -585,45 +516,15 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_fneg_f32_fpext_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_fneg_f32_fpext_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_med3_f32 v0, -v0, -v1, -v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -636,25 +537,15 @@ define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { } define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v2, -|v2| -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_fneg_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -|v2| +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: ; GFX8-SDAG: ; %bb.0: @@ -758,9 +649,9 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX7-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -805,29 +696,17 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar } define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v5, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v5 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v5 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v5, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: v_med3_f32 v0, v5, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: ; GFX8-SDAG: ; %bb.0: @@ -874,29 +753,17 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, } define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v1 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: flat_store_dword v[3:4], v1 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: ; GFX8-SDAG: ; %bb.0: @@ -943,29 +810,17 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, } define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v2 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: flat_store_dword v[3:4], v2 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_multi_use_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_dword v[3:4], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: ; GFX8-SDAG: ; %bb.0: @@ -1030,35 +885,15 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 { } define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext bfloat %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -1068,35 +903,15 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 } define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext bfloat %arg1 to float %arg2.ext = fpext half %arg2 to float @@ -1106,35 +921,15 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 } define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_f16_bf16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float %arg1.ext = fpext half %arg1 to float %arg2.ext = fpext bfloat %arg2 to float @@ -1147,8 +942,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, s4, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1214,8 +1009,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, s4, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1281,8 +1076,8 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, s4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index b37ab370d0bbf..668347eb97004 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7449,7 +7449,12 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-SDAG-NEXT: s_endpgm @@ -7624,7 +7629,22 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-SDAG-NEXT: s_endpgm @@ -8694,24 +8714,16 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) { } define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) { -; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: ; VI-SDAG: ; %bb.0: @@ -8778,12 +8790,18 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v1, 4.0, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8994,11 +9012,30 @@ define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 -; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v3, 2.0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-SDAG-NEXT: v_max_f32_e32 v3, 2.0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_min_f32_e32 v3, 4.0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v4, 4.0, v1 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -9182,8 +9219,18 @@ define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v0 +; SI-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index bb6b20df0c149..4f6369078c386 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1039,20 +1039,26 @@ define half @v_min3_f16_minimumnum_minimumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,40 +1124,52 @@ define <2 x half> @v_min3_v2f16_minimumnum_minimumnum__v_v_v_0(<2 x half> %a, <2 ; GFX6-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v2f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v3, v5, v4, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1237,23 +1255,32 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX6-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_min_f32_e32 v2, v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1262,23 +1289,32 @@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX7-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1397,70 +1433,94 @@ define <4 x half> @v_min3_v4f16_minimumnum_minimumnum__v_v_v_0(<4 x half> %a, <4 ; GFX6-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX6-NEXT: v_min3_f32 v9, v11, v10, v9 -; GFX6-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX6-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v8, v10, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v2, v3, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min3_f32 v6, v8, v7, v6 -; GFX7-NEXT: v_min3_f32 v9, v11, v10, v9 -; GFX7-NEXT: v_min3_f32 v0, v0, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX7-NEXT: v_min3_f32 v1, v1, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v8, v10, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v2, v3, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index fee2fad933158..6be2eb93ee25c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -400,27 +400,30 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -690,27 +693,30 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -977,20 +983,30 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_min_f32_e32 v1, v7, v6 -; SI-NEXT: v_min3_f32 v0, v2, v0, v3 -; SI-NEXT: v_min3_f32 v1, v5, v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_min_f32_e32 v5, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_min_f32_e32 v0, v2, v0 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index dd77eb6f364a7..8c9dccceff192 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -27,10 +27,10 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: @@ -120,15 +120,16 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177,14 +178,14 @@ define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_min_f32_e32 v1, v3, v2 +; SI-NEXT: v_min_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -236,19 +237,21 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -305,17 +308,17 @@ define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v2, v5, v4 +; SI-NEXT: v_min_f32_e32 v2, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -382,28 +385,30 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-NEXT: v_min_legacy_f32_e32 v2, v7, v6 -; SI-NEXT: v_min_legacy_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v4 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; SI-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -470,28 +475,28 @@ define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NEXT: v_min_f32_e32 v2, v7, v6 -; SI-NEXT: v_min_f32_e32 v3, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v2, v6, v7 +; SI-NEXT: v_min_f32_e32 v3, v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -582,50 +587,54 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-NEXT: v_min_legacy_f32_e32 v4, v15, v14 -; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v12 -; SI-NEXT: v_min_legacy_f32_e32 v6, v11, v10 -; SI-NEXT: v_min_legacy_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 +; SI-NEXT: v_cmp_ngt_f32_e64 s[4:5], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_cmp_ngt_f32_e64 s[6:7], v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cmp_ngt_f32_e64 s[8:9], v19, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v19, v18 +; SI-NEXT: v_cndmask_b32_e32 v6, v13, v12, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v17, v16 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 @@ -722,50 +731,50 @@ define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) ; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_min_f32_e32 v3, v3, v7 ; SI-NEXT: v_min_f32_e32 v2, v2, v6 ; SI-NEXT: v_min_f32_e32 v1, v1, v5 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NEXT: v_min_f32_e32 v4, v15, v14 -; SI-NEXT: v_min_f32_e32 v5, v13, v12 -; SI-NEXT: v_min_f32_e32 v6, v11, v10 -; SI-NEXT: v_min_f32_e32 v7, v9, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v4, v14, v15 +; SI-NEXT: v_min_f32_e32 v5, v12, v13 +; SI-NEXT: v_min_f32_e32 v6, v10, v11 +; SI-NEXT: v_min_f32_e32 v7, v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_or_b32_e32 v2, v2, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index f8719936b2d0a..082006898b436 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -598,31 +598,31 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v5, v7, v5 -; SI-NEXT: v_mul_f32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_mul_f32_e32 v1, v3, v1 ; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_mul_f32_e32 v2, v7, v5 +; SI-NEXT: v_mul_f32_e32 v3, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -713,37 +713,37 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3 -; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_add_f32_e32 v0, v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; SI-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v4f16_imm_a: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 2079ee54653ce..16ec854a12c53 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -22,13 +22,13 @@ define half @v_fneg_add_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -84,9 +84,9 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_add_store_use_add_f16: @@ -131,20 +131,22 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -210,7 +212,8 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16: @@ -273,7 +276,8 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16: @@ -333,11 +337,11 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16: @@ -400,11 +404,10 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v2, -v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-SAFE-NEXT: v_mov_b32_e32 v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v2 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16: @@ -412,10 +415,9 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -483,28 +485,27 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, v3, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v2 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -574,28 +575,38 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, ; SI-LABEL: fneg_fadd_0_safe_f16: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 -; SI-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; SI-NEXT: v_fma_f32 v3, v5, v3, v3 -; SI-NEXT: v_mul_f32_e32 v5, v4, v3 -; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 -; SI-NEXT: v_fma_f32 v5, v6, v3, v5 -; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, 0x7e00 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_safe_f16: @@ -644,14 +655,20 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_mov_b32_e32 v3, s0 ; SI-NEXT: v_rcp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, 0x7e00 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_nsz_f16: @@ -699,9 +716,8 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -738,9 +754,9 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_store_use_mul_f16: @@ -782,12 +798,12 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_multi_use_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -905,9 +921,8 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -946,10 +961,9 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mul_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -997,14 +1011,13 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3 -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, v4, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1057,8 +1070,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -1106,8 +1117,6 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -1144,8 +1153,7 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_ieee: @@ -1178,8 +1186,7 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_no_ieee: @@ -1212,7 +1219,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1255,7 +1261,6 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1291,7 +1296,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1334,7 +1338,6 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1372,7 +1375,8 @@ define half @v_fneg_0_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_minnum_f16: @@ -1412,7 +1416,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1455,7 +1458,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1502,7 +1504,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1549,7 +1550,6 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1588,7 +1588,9 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,10 +1636,11 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1687,7 +1690,9 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1729,13 +1734,12 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,18 +1788,34 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) } define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) #4 { -; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-SAFE-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-NSZ-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: +; SI-NSZ: ; %bb.0: +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, v0 mul:4 +; SI-NSZ-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: ; VI: ; %bb.0: @@ -1846,8 +1866,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -1895,8 +1913,6 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -1933,8 +1949,7 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_ieee: @@ -1967,8 +1982,7 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee: @@ -2001,7 +2015,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2044,7 +2057,6 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2080,7 +2092,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2123,7 +2134,6 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2161,7 +2171,8 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_maxnum_f16: @@ -2201,7 +2212,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2244,7 +2254,6 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2283,7 +2292,9 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2332,7 +2343,9 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2387,12 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2429,18 +2441,34 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) } define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) #4 { -; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-SAFE-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-NSZ-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: +; SI-NSZ: ; %bb.0: +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, v0 mul:4 +; SI-NSZ-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: ; VI: ; %bb.0: @@ -2492,20 +2520,111 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_f16: @@ -2556,11 +2675,56 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fma_store_use_fma_f16: @@ -2603,23 +2767,116 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) # ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -2682,20 +2939,111 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: @@ -2747,20 +3095,111 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: @@ -2812,20 +3251,111 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: @@ -2877,21 +3407,112 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: @@ -2943,21 +3564,112 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-SAFE-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NSZ-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: @@ -3008,26 +3720,115 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_fma_f32 v0, v0, v3, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v3, 0xffff8000, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[6:7], v[4:5], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mov_b32_e32 v1, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, -v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[5:6], v[3:4], v[1:2] +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0x1ff, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NSZ-NEXT: v_bfe_u32 v4, v3, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: @@ -3094,30 +3895,119 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v8, -v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[6:7], v[4:5], v[0:1] +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-SAFE-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v5, v0, v5 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; SI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x8000, v1 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v8, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4 -; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NSZ-NEXT: v_fma_f64 v[1:2], v[6:7], v[4:5], v[1:2] +; SI-NSZ-NEXT: v_and_b32_e32 v4, 0x1ff, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffe, v4 +; SI-NSZ-NEXT: v_bfe_u32 v5, v2, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NSZ-NEXT: v_sub_i32_e32 v6, vcc, s4, v5 +; SI-NSZ-NEXT: v_or_b32_e32 v4, 0x1000, v1 +; SI-NSZ-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v5, vcc, s4, v5 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; SI-NSZ-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NSZ-NEXT: v_or_b32_e32 v6, v1, v6 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v6, 7, v4 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; SI-NSZ-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0x8000, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: @@ -3189,21 +4079,27 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -3254,35 +4150,47 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c ; SI-SAFE-LABEL: v_fneg_fmad_v4f32: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_add_f32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-SAFE-NEXT: v_mul_f32_e32 v3, v7, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v9, v11, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_mac_f32_e32 v5, v1, v3 -; SI-SAFE-NEXT: v_mac_f32_e32 v6, v8, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-SAFE-NEXT: v_mac_f32_e32 v4, v0, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v4 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SAFE-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 @@ -3291,38 +4199,50 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c ; SI-NSZ-LABEL: v_fneg_fmad_v4f32: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NSZ-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NSZ-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NSZ-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NSZ-NEXT: v_mul_f32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v7, v7, v9 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v10 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NSZ-NEXT: v_mad_f32 v6, v7, v11, -v6 -; SI-NSZ-NEXT: v_mad_f32 v8, v9, v10, -v8 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v2, -v4 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NSZ-NEXT: v_mad_f32 v1, v1, v3, -v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NSZ-NEXT: v_sub_f32_e32 v6, v6, v8 +; SI-NSZ-NEXT: v_sub_f32_e32 v2, v7, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v8 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NSZ-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fmad_v4f32: @@ -3390,25 +4310,33 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) ; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; @@ -3474,7 +4402,6 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 { ; SI-LABEL: v_fneg_fp_extend_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3564,8 +4491,7 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3617,10 +4543,10 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) # ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: @@ -3673,11 +4599,11 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 -; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 4.0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: @@ -3820,7 +4746,6 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_f64_to_f16: @@ -3972,7 +4897,6 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16: @@ -4125,7 +5049,6 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4387,7 +5310,6 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_mul_f64 v[1:2], -v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4546,9 +5468,8 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: @@ -4761,7 +5682,6 @@ define half @v_fneg_trunc_f16(half %a) #0 { ; SI-LABEL: v_fneg_trunc_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_trunc_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4798,33 +5718,20 @@ define half @v_fneg_trunc_f16(half %a) #0 { ; -------------------------------------------------------------------------------- define half @v_fneg_round_f16(half %a) #0 { -; SI-SAFE-LABEL: v_fneg_round_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; SI-SAFE-NEXT: s_brev_b32 s4, -2 -; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0 -; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NSZ-LABEL: v_fneg_round_f16: -; SI-NSZ: ; %bb.0: -; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1 -; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; SI-NSZ-NEXT: s_brev_b32 s4, -2 -; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 -; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_fneg_round_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v1, v0 +; SI-NEXT: v_sub_f32_e32 v2, v0, v1 +; SI-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_bfi_b32 v0, s4, v2, v0 +; SI-NEXT: v_add_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_round_f16: ; VI-SAFE: ; %bb.0: @@ -4927,7 +5834,6 @@ define half @v_fneg_rint_f16(half %a) #0 { ; SI-LABEL: v_fneg_rint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4967,7 +5873,6 @@ define half @v_fneg_nearbyint_f16(half %a) #0 { ; SI-LABEL: v_fneg_nearbyint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -5007,7 +5912,6 @@ define half @v_fneg_sin_f16(half %a) #0 { ; SI-LABEL: v_fneg_sin_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; SI-NEXT: v_fract_f32_e32 v0, v0 @@ -5057,7 +5961,6 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5101,24 +6004,21 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_cbranch_execz .LBB81_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f16_f32_e64 v4, -v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v2 ; SI-NEXT: v_mul_f32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: flat_store_short v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5235,9 +6135,8 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 { ; SI-LABEL: v_fneg_inlineasm_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ;;#ASMSTART @@ -5296,8 +6195,8 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a, ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v0 ; SI-NEXT: ;;#ASMSTART ; SI-NEXT: ; use v1 ; SI-NEXT: ;;#ASMEND @@ -5363,12 +6262,94 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, -v0, v1, v2 -; SI-NEXT: v_fma_f32 v2, -v0, v2, 2.0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_movk_i32 s5, 0xfc10 +; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v3 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 +; SI-NEXT: v_med3_i32 v8, v8, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v8, v2, v8 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_and_b32_e32 v8, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v2 +; SI-NEXT: v_fma_f64 v[1:2], v[4:5], v[0:1], 2.0 +; SI-NEXT: v_or_b32_e32 v0, v3, v6 +; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 +; SI-NEXT: v_bfe_u32 v4, v2, 20, 11 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; SI-NEXT: v_med3_i32 v5, v5, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v1, v5 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; SI-NEXT: v_and_b32_e32 v5, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop3_users_f16: @@ -5417,14 +6398,13 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) ; SI-LABEL: multiuse_fneg_2_vop2_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e64 v3, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_mul_f32_e32 v0, v3, v1 +; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop2_users_f16: @@ -5472,14 +6452,57 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, ; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_fma_f32 v1, v0, v1, 2.0 -; SI-NEXT: v_mul_f32_e32 v2, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v5, -v2 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], 2.0 +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16: @@ -5527,28 +6550,120 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-SAFE-NEXT: v_fma_f32 v0, v1, v0, 2.0 -; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v0, v2 -; SI-SAFE-NEXT: v_mul_f32_e64 v2, -v0, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-SAFE-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], 2.0 +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-SAFE-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-SAFE-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-SAFE-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SAFE-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-SAFE-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-SAFE-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-SAFE-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-SAFE-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-SAFE-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-SAFE-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-SAFE-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-SAFE-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NSZ-NEXT: v_fma_f32 v0, v1, -v0, -2.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v0, v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v2, v0, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NSZ-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], -2.0 +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NSZ-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NSZ-NEXT: v_sub_i32_e32 v6, vcc, s4, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NSZ-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v7, v6, v2 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; SI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NSZ-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NSZ-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; SI-NSZ-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NSZ-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v6, 7, v2 +; SI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NSZ-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NSZ-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NSZ-NEXT: v_mov_b32_e32 v7, 0x7e00 +; SI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NSZ-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16: @@ -5620,12 +6735,59 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h ; SI-LABEL: one_use_cost_to_fold_into_src_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_trunc_f32_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_trunc_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_sub_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v3, v3, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v3, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v5, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: one_use_cost_to_fold_into_src_f16: @@ -5670,15 +6832,63 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha ; SI-LABEL: multi_use_cost_to_fold_into_src: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_trunc_f32_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 -; SI-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, -v8 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[6:7], v[2:3], v[0:1] +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v3, v3, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v6, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v3, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multi_use_cost_to_fold_into_src: @@ -5733,27 +6943,115 @@ define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %ar ; SI-LABEL: fneg_fma_fneg_dagcombine_loop: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; SI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_xor_b32_e32 v7, 0x80008000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_brev_b32 s5, 1 +; SI-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; SI-NEXT: s_movk_i32 s6, 0x3f1 +; SI-NEXT: s_movk_i32 s7, 0xfc10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_fma_f64 v[3:4], v[3:4], v[5:6], s[4:5] ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5 +; SI-NEXT: v_bfe_u32 v6, v4, 20, 11 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v5, 0x1000, v3 +; SI-NEXT: v_med3_i32 v10, v10, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v11, v10, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v5 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v6 +; SI-NEXT: v_or_b32_e32 v5, v11, v5 +; SI-NEXT: v_or_b32_e32 v10, v3, v10 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; SI-NEXT: v_and_b32_e32 v10, 7, v5 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; SI-NEXT: v_mov_b32_e32 v10, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc +; SI-NEXT: v_mov_b32_e32 v11, 0x7e00 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_movk_i32 s8, 0x40f +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 +; SI-NEXT: v_cndmask_b32_e32 v12, v5, v3, vcc +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[3:4], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_fma_f64 v[3:4], v[5:6], v[3:4], s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; SI-NEXT: v_bfe_u32 v7, v4, 20, 11 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v6, 0x1000, v3 +; SI-NEXT: v_med3_i32 v9, v9, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v12, v9, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v6 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v12, v6 +; SI-NEXT: v_or_b32_e32 v9, v3, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; SI-NEXT: v_and_b32_e32 v9, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, 1 -; SI-NEXT: v_fma_f32 v5, v5, v7, s4 -; SI-NEXT: v_sub_f32_e32 v4, v5, v4 -; SI-NEXT: v_fma_f32 v1, v1, v2, s4 -; SI-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-NEXT: v_mul_f32_e32 v1, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_mul_f32_e32 v1, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, v0, v6 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -5796,7 +7094,6 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: nnan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -5835,7 +7132,6 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denormal_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -5873,11 +7169,12 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denorm_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5920,9 +7217,10 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: flush_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5965,12 +7263,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6023,13 +7320,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 5d23f648f707b..410316b1d4d76 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7777,12 +7777,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7810,13 +7809,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index afe0b8c3b392b..ca2aa47fbcf5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -938,7 +938,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -960,7 +959,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1491,7 +1489,6 @@ define half @v_fneg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NEXT: v_max_f32_e32 v2, 0xbe230000, v0 @@ -1518,7 +1515,6 @@ define half @v_fneg_neg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NEXT: v_max_f32_e32 v2, 0x3e230000, v0 @@ -2084,7 +2080,6 @@ define half @v_fneg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2106,7 +2101,6 @@ define half @v_fneg_neg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3759,12 +3753,11 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3792,13 +3785,13 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_sub_f32_e32 v1, v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_sub_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-NEXT: v_sub_f32_e32 v0, v3, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4112,14 +4105,14 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; SI-NEXT: s_load_dword s2, s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_bitcmp1_b32 s2, 16 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: s_and_b32 s3, 0x10000, s2 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_xor_b32 s2, s2, 0x8000 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_endpgm ; @@ -4149,12 +4142,11 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { ; SI-LABEL: v_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_select_infloop_regression_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index cbd4017c6cf1c..00d53cd265c28 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0x7fff ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s0| ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index d9dea4f1fd6e7..9b44acd5c0716 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -601,16 +601,14 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1 ; GFX7-LABEL: select_fneg_select_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_f16: @@ -720,37 +718,23 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX7-LABEL: select_fneg_select_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_bfi_b32 v4, s4, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_v2f16: @@ -1320,11 +1304,11 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index cab27fca5ab0a..3140b87c8108e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -631,17 +631,17 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 -; CI-NEXT: v_sub_f32_e32 v1, 2.0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: flat_store_short v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; ; GFX8-LABEL: v_extract_fneg_fold_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 7ff5bbf4821b7..303864ff9434a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -599,11 +599,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x204 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s4| +; SI-NEXT: v_cmp_eq_f32_e32 vcc, s5, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -642,16 +642,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s4, s[4:5], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x1f8 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s1, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e64 v1, |s0| +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cmp_neq_f32_e64 s[0:1], s1, v1 +; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0_f16: @@ -695,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s5, 0x1f8 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s5 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s4| +; SI-NEXT: v_cmp_lg_f32_e32 vcc, s5, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 8df756481e54a..94c2d3364a769 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -184,12 +184,12 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -681,7 +681,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -985,13 +985,13 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1189,10 +1189,10 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll index 3f1aea2e3773d..862c7ac9f762b 100644 --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -518,8 +518,8 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 @@ -660,23 +660,23 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_log_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_exp_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_exp_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index a043d537fbc45..ac269ee0d5abe 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -285,8 +285,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -384,8 +384,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -487,8 +487,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v2, v2 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index d8660617c7677..9016d4fd67d62 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1580,7 +1580,11 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_floor_f32_e32 v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1590,7 +1594,11 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_floor_f32_e32 v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1673,10 +1681,18 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_floor_f32_e32 v3, v1 -; GFX6-NEXT: v_floor_f32_e32 v2, v0 -; GFX6-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_floor_f32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_floor_f32_e32 v3, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 @@ -1691,10 +1707,18 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_floor_f32_e32 v3, v1 -; GFX7-NEXT: v_floor_f32_e32 v2, v0 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_floor_f32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_floor_f32_e32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v1, 0x3f7fe000, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 @@ -1859,38 +1883,44 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly ; GFX6-LABEL: safe_math_fract_f16_noinf_check: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_f16_noinf_check: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2486,44 +2516,52 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX6-LABEL: safe_math_fract_f16: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e64 v6, |v0| +; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_f16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v6, |v0| +; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2654,38 +2692,45 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_floor_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX6-NEXT: v_floor_f32_e32 v7, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_sub_f32_e32 v7, v4, v7 -; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_floor_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_floor_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX6-NEXT: v_sub_f32_e32 v8, v4, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: v_sub_f32_e32 v9, v5, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v9 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX6-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v0, vcc ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX6-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2693,38 +2738,45 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_floor_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX7-NEXT: v_floor_f32_e32 v7, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_sub_f32_e32 v7, v4, v7 -; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX7-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-NEXT: v_floor_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_floor_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: v_sub_f32_e32 v8, v4, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_sub_f32_e32 v9, v5, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_min_f32_e32 v7, 0x3f7fe000, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_min_f32_e32 v8, 0x3f7fe000, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0x7fff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v0, vcc ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: buffer_store_dword v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index fcadfcdd087be..2ee95e535b04a 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -5414,21 +5414,21 @@ define void @freeze_i256(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-GISEL-LABEL: freeze_i256: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: freeze_i256: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: freeze_i256: ; GFX9-GISEL: ; %bb.0: @@ -6156,6 +6156,8 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -6181,6 +6183,8 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -6352,6 +6356,15 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -6381,6 +6394,15 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 88f6427d94042..374747ada621b 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -24,37 +24,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v2 ; SI-NEXT: s_cbranch_vccz .LBB0_2 ; SI-NEXT: ; %bb.1: ; %frem.else -; SI-NEXT: v_and_b32_e32 v5, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_8 ; SI-NEXT: .LBB0_2: -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB0_3: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v3 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v3, v4 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s0, v3 ; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SI-NEXT: v_frexp_mant_f32_e32 v3, v4 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s3 ; SI-NEXT: v_frexp_mant_f32_e32 v4, v2 @@ -111,25 +106,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v4, s0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 ; SI-NEXT: .LBB0_8: ; %Flow19 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-NEXT: s_mov_b32 s0, 0x7f800000 ; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -143,37 +133,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s10 ; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; CI-NEXT: v_and_b32_e32 v4, 0x7fffffff, v0 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1| -; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB0_2 ; CI-NEXT: ; %bb.1: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_and_b32_e32 v5, 0x80000000, v0 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v2 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x8000, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; CI-NEXT: s_cbranch_execz .LBB0_3 ; CI-NEXT: s_branch .LBB0_8 ; CI-NEXT: .LBB0_2: -; CI-NEXT: ; implicit-def: $vgpr3 +; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: .LBB0_3: ; %frem.compute -; CI-NEXT: v_frexp_mant_f32_e32 v3, v4 -; CI-NEXT: v_ldexp_f32_e64 v5, v3, 11 -; CI-NEXT: v_frexp_mant_f32_e32 v3, v2 -; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v4, 1 ; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; CI-NEXT: v_ldexp_f32_e64 v5, v2, 11 ; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; CI-NEXT: v_not_b32_e32 v4, v2 ; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 @@ -219,25 +204,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_bfi_b32 v3, s0, v2, v0 +; CI-NEXT: v_and_b32_e32 v3, 0x8000, v0 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: .LBB0_8: ; %Flow19 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_mov_b32 s0, 0x7f800000 -; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s0, 0x7f800000 +; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s0, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -1248,84 +1228,181 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; SI-NEXT: v_rcp_f32_e32 v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v0, v3 +; SI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s5, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffe +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s5, 0x1000 +; SI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; SI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; SI-NEXT: v_med3_i32 v0, s8, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s7, 12 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_movk_i32 s5, 0x7e00 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; CI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v0, v3 +; CI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; CI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; CI-NEXT: v_readfirstlane_b32 s4, v1 +; CI-NEXT: s_and_b32 s5, s4, 0x1ff +; CI-NEXT: v_or_b32_e32 v0, s5, v0 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-NEXT: s_lshr_b32 s5, s4, 8 +; CI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; CI-NEXT: s_and_b32 s5, s5, 0xffe +; CI-NEXT: v_readfirstlane_b32 s6, v0 +; CI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; CI-NEXT: s_or_b32 s5, s5, s6 +; CI-NEXT: v_med3_i32 v0, s8, 0, 13 +; CI-NEXT: s_or_b32 s6, s5, 0x1000 +; CI-NEXT: v_readfirstlane_b32 s8, v0 +; CI-NEXT: s_lshr_b32 s9, s6, s8 +; CI-NEXT: s_lshl_b32 s8, s9, s8 +; CI-NEXT: s_cmp_lg_u32 s8, s6 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: s_addk_i32 s7, 0xfc10 +; CI-NEXT: s_lshl_b32 s8, s7, 12 +; CI-NEXT: s_or_b32 s6, s9, s6 +; CI-NEXT: s_or_b32 s8, s5, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 1 +; CI-NEXT: s_cselect_b32 s6, s6, s8 +; CI-NEXT: s_and_b32 s8, s6, 7 +; CI-NEXT: s_cmp_gt_i32 s8, 5 +; CI-NEXT: s_cselect_b32 s9, 1, 0 +; CI-NEXT: s_cmp_eq_u32 s8, 3 +; CI-NEXT: s_cselect_b32 s8, 1, 0 +; CI-NEXT: s_or_b32 s8, s8, s9 +; CI-NEXT: s_lshr_b32 s6, s6, 2 +; CI-NEXT: s_add_i32 s6, s6, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 31 +; CI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; CI-NEXT: s_cmp_lg_u32 s5, 0 +; CI-NEXT: s_movk_i32 s5, 0x7e00 +; CI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; CI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; CI-NEXT: s_cselect_b32 s5, s5, s6 +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s4, s4, 0x8000 +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f16: @@ -1649,84 +1726,181 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; SI-NEXT: v_rcp_f32_e32 v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v0, v3 +; SI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; SI-NEXT: v_trunc_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s5, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xffe +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s5, 0x1000 +; SI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; SI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; SI-NEXT: v_med3_i32 v0, s8, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s7, 12 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_movk_i32 s5, 0x7e00 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s6, s10 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s0 -; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: s_mov_b32 s0, s2 -; CI-NEXT: s_mov_b32 s1, s3 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, v2 +; CI-NEXT: v_div_scale_f32 v0, vcc, v2, v4, v2 +; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v1, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v0, v3 +; CI-NEXT: v_fma_f32 v6, -v1, v5, v0 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v0, -v1, v5, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 -; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 +; CI-NEXT: v_div_fmas_f32 v0, v0, v3, v5 +; CI-NEXT: v_div_fixup_f32 v0, v0, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_trunc_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3] +; CI-NEXT: v_readfirstlane_b32 s4, v1 +; CI-NEXT: s_and_b32 s5, s4, 0x1ff +; CI-NEXT: v_or_b32_e32 v0, s5, v0 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-NEXT: s_lshr_b32 s5, s4, 8 +; CI-NEXT: s_bfe_u32 s7, s4, 0xb0014 +; CI-NEXT: s_and_b32 s5, s5, 0xffe +; CI-NEXT: v_readfirstlane_b32 s6, v0 +; CI-NEXT: s_sub_i32 s8, 0x3f1, s7 +; CI-NEXT: s_or_b32 s5, s5, s6 +; CI-NEXT: v_med3_i32 v0, s8, 0, 13 +; CI-NEXT: s_or_b32 s6, s5, 0x1000 +; CI-NEXT: v_readfirstlane_b32 s8, v0 +; CI-NEXT: s_lshr_b32 s9, s6, s8 +; CI-NEXT: s_lshl_b32 s8, s9, s8 +; CI-NEXT: s_cmp_lg_u32 s8, s6 +; CI-NEXT: s_cselect_b32 s6, 1, 0 +; CI-NEXT: s_addk_i32 s7, 0xfc10 +; CI-NEXT: s_lshl_b32 s8, s7, 12 +; CI-NEXT: s_or_b32 s6, s9, s6 +; CI-NEXT: s_or_b32 s8, s5, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 1 +; CI-NEXT: s_cselect_b32 s6, s6, s8 +; CI-NEXT: s_and_b32 s8, s6, 7 +; CI-NEXT: s_cmp_gt_i32 s8, 5 +; CI-NEXT: s_cselect_b32 s9, 1, 0 +; CI-NEXT: s_cmp_eq_u32 s8, 3 +; CI-NEXT: s_cselect_b32 s8, 1, 0 +; CI-NEXT: s_or_b32 s8, s8, s9 +; CI-NEXT: s_lshr_b32 s6, s6, 2 +; CI-NEXT: s_add_i32 s6, s6, s8 +; CI-NEXT: s_cmp_lt_i32 s7, 31 +; CI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; CI-NEXT: s_cmp_lg_u32 s5, 0 +; CI-NEXT: s_movk_i32 s5, 0x7e00 +; CI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; CI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; CI-NEXT: s_cselect_b32 s5, s5, s6 +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s4, s4, 0x8000 +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f16: @@ -4827,67 +5001,58 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v3 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v4, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v4 ; SI-NEXT: s_cbranch_vccz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %frem.else20 -; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v4 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB9_3 ; SI-NEXT: s_branch .LBB9_8 ; SI-NEXT: .LBB9_2: -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB9_3: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v3 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s0, v2 ; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SI-NEXT: v_ldexp_f32_e64 v5, v4, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v4, v6 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v6 +; SI-NEXT: v_frexp_mant_f32_e32 v2, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v2, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s3 +; SI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v4 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: v_readfirstlane_b32 s0, v4 ; SI-NEXT: s_cselect_b32 s3, s0, 0 ; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v4, v4, 1 +; SI-NEXT: v_ldexp_f32_e64 v2, v2, 1 ; SI-NEXT: s_not_b32 s1, s0 ; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v4, 1.0 -; SI-NEXT: v_div_scale_f32 v7, s[4:5], v4, v4, 1.0 -; SI-NEXT: v_rcp_f32_e32 v8, v7 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; SI-NEXT: v_fma_f32 v8, v9, v8, v8 -; SI-NEXT: v_mul_f32_e32 v9, v6, v8 -; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; SI-NEXT: v_fma_f32 v9, v10, v8, v9 -; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 -; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_7 ; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader @@ -4895,45 +5060,44 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_add_i32 s1, s1, 11 ; SI-NEXT: .LBB9_5: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mul_f32_e32 v5, v7, v6 -; SI-NEXT: v_rndne_f32_e32 v5, v5 -; SI-NEXT: v_fma_f32 v5, -v5, v4, v7 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 -; SI-NEXT: v_add_f32_e32 v8, v5, v4 -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 ; SI-NEXT: s_cbranch_scc1 .LBB9_5 ; SI-NEXT: ; %bb.6: ; %Flow55 -; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1 -; SI-NEXT: v_mul_f32_e32 v6, v5, v6 -; SI-NEXT: v_rndne_f32_e32 v6, v6 -; SI-NEXT: v_fma_f32 v5, -v6, v4, v5 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 -; SI-NEXT: v_add_f32_e32 v4, v5, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SI-NEXT: v_ldexp_f32_e64 v4, v4, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v4, s0, v4, v2 -; SI-NEXT: .LBB9_8: -; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v6, |v5| -; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7| +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s1 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff8000, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: .LBB9_8: ; %Flow58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v3| +; SI-NEXT: v_cvt_f32_f16_e64 v7, |v4| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 ; SI-NEXT: s_cbranch_vccz .LBB9_10 ; SI-NEXT: ; %bb.9: ; %frem.else -; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x8000, v3 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v7 -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB9_11 ; SI-NEXT: s_branch .LBB9_16 @@ -5005,38 +5169,29 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; SI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0x8000, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: .LBB9_16: ; %Flow54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; SI-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 -; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: s_mov_b32 s2, 0x7f800000 ; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: s_and_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -5050,101 +5205,91 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s10 ; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |v0| ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v2 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3| -; CI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v2 -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB9_2 ; CI-NEXT: ; %bb.1: ; %frem.else20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_and_b32_e32 v7, 0x80000000, v2 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; CI-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; CI-NEXT: s_cbranch_execz .LBB9_3 ; CI-NEXT: s_branch .LBB9_8 ; CI-NEXT: .LBB9_2: -; CI-NEXT: ; implicit-def: $vgpr4 +; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: .LBB9_3: ; %frem.compute19 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 -; CI-NEXT: v_frexp_mant_f32_e32 v4, v6 -; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v5 -; CI-NEXT: v_ldexp_f32_e64 v5, v6, 1 -; CI-NEXT: v_div_scale_f32 v11, s[0:1], v5, v5, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v7, v4, 11 -; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v10 -; CI-NEXT: v_not_b32_e32 v6, v4 -; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v5, 1.0 -; CI-NEXT: v_rcp_f32_e32 v12, v11 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v2, v4 +; CI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 +; CI-NEXT: v_ldexp_f32_e64 v3, v4, 1 +; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 +; CI-NEXT: v_ldexp_f32_e64 v5, v2, 11 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; CI-NEXT: v_fma_f32 v12, v13, v12, v12 -; CI-NEXT: v_mul_f32_e32 v13, v8, v12 -; CI-NEXT: v_fma_f32 v14, -v11, v13, v8 -; CI-NEXT: v_fma_f32 v13, v14, v12, v13 -; CI-NEXT: v_fma_f32 v8, -v11, v13, v8 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v12, v13 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 -; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_7 ; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader -; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 -; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6 +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v4 ; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v9, v7 -; CI-NEXT: v_mul_f32_e32 v7, v9, v8 -; CI-NEXT: v_rndne_f32_e32 v7, v7 -; CI-NEXT: v_fma_f32 v7, -v7, v5, v9 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 -; CI-NEXT: v_add_f32_e32 v10, v7, v5 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc -; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v6 -; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 ; CI-NEXT: s_cbranch_vccnz .LBB9_5 ; CI-NEXT: ; %bb.6: ; %Flow55 -; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: v_mov_b32_e32 v5, v7 ; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 -; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6 -; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 -; CI-NEXT: v_mul_f32_e32 v7, v6, v8 -; CI-NEXT: v_rndne_f32_e32 v7, v7 -; CI-NEXT: v_fma_f32 v6, -v7, v5, v6 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 -; CI-NEXT: v_add_f32_e32 v5, v6, v5 -; CI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_bfi_b32 v4, s0, v4, v2 -; CI-NEXT: .LBB9_8: -; CI-NEXT: v_cvt_f16_f32_e32 v5, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v5| -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v3, 0xffff8000, v0 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: .LBB9_8: ; %Flow58 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v3| +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v4| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; CI-NEXT: s_cbranch_vccz .LBB9_10 ; CI-NEXT: ; %bb.9: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v0 +; CI-NEXT: v_and_b32_e32 v5, 0x8000, v3 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc ; CI-NEXT: s_cbranch_execz .LBB9_11 ; CI-NEXT: s_branch .LBB9_16 ; CI-NEXT: .LBB9_10: @@ -5203,38 +5348,29 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_bfi_b32 v5, s0, v5, v0 +; CI-NEXT: v_and_b32_e32 v6, 0x8000, v3 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; CI-NEXT: v_or_b32_e32 v5, v5, v6 ; CI-NEXT: .LBB9_16: ; %Flow54 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_mov_b32 s2, 0x7f800000 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: s_mov_b32 s2, 0x7f800000 ; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; CI-NEXT: s_endpgm ; @@ -7139,800 +7275,754 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s10 -; SI-NEXT: s_mov_b32 s5, s11 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 +; SI-NEXT: v_readfirstlane_b32 s2, v1 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_cvt_f32_f16_e64 v3, |s3| ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v6 -; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v7 -; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7| -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] -; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; SI-NEXT: s_cbranch_vccz .LBB10_2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |v0| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; SI-NEXT: s_cbranch_vccz .LBB10_3 ; SI-NEXT: ; %bb.1: ; %frem.else86 -; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SI-NEXT: s_and_b32 s4, s3, 0xffff8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s4, s4, s3 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_3 -; SI-NEXT: s_branch .LBB10_8 -; SI-NEXT: .LBB10_2: -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: ; %bb.2: +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_branch .LBB10_9 +; SI-NEXT: .LBB10_3: +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_3: ; %frem.compute85 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 +; SI-NEXT: .LBB10_4: ; %frem.compute85 +; SI-NEXT: s_mov_b32 s5, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s5 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v3 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v8 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v8, v9 -; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v8, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v8, v10 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v10 +; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: s_cselect_b32 s4, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s5 +; SI-NEXT: v_frexp_mant_f32_e32 v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v10 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v2 +; SI-NEXT: s_cselect_b32 s5, s0, 0 +; SI-NEXT: s_add_i32 s0, s5, -1 +; SI-NEXT: v_ldexp_f32_e64 v2, v4, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 -; SI-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 -; SI-NEXT: v_rcp_f32_e32 v12, v11 +; SI-NEXT: s_add_i32 s1, s1, s4 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v2, 1.0 +; SI-NEXT: v_div_scale_f32 v5, s[6:7], v2, v2, 1.0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; SI-NEXT: v_fma_f32 v12, v13, v12, v12 -; SI-NEXT: v_mul_f32_e32 v13, v10, v12 -; SI-NEXT: v_fma_f32 v14, -v11, v13, v10 -; SI-NEXT: v_fma_f32 v13, v14, v12, v13 -; SI-NEXT: v_fma_f32 v10, -v11, v13, v10 +; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v7, v6, v6 +; SI-NEXT: v_mul_f32_e32 v7, v4, v6 +; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, v8, v6, v7 +; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 -; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 +; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_8 +; SI-NEXT: ; %bb.5: ; %frem.loop_body93.preheader +; SI-NEXT: s_sub_i32 s1, s4, s5 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_5: ; %frem.loop_body93 +; SI-NEXT: .LBB10_6: ; %frem.loop_body93 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mul_f32_e32 v9, v11, v10 -; SI-NEXT: v_rndne_f32_e32 v9, v9 -; SI-NEXT: v_fma_f32 v9, -v9, v8, v11 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 -; SI-NEXT: v_add_f32_e32 v12, v9, v8 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v9, 11 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mul_f32_e32 v3, v5, v4 +; SI-NEXT: v_rndne_f32_e32 v3, v3 +; SI-NEXT: v_fma_f32 v3, -v3, v2, v5 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v6, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_5 -; SI-NEXT: ; %bb.6: ; %Flow133 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB10_7: ; %frem.loop_exit94 +; SI-NEXT: s_cbranch_scc1 .LBB10_6 +; SI-NEXT: ; %bb.7: ; %Flow133 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: .LBB10_8: ; %frem.loop_exit94 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1 -; SI-NEXT: v_mul_f32_e32 v10, v9, v10 -; SI-NEXT: v_rndne_f32_e32 v10, v10 -; SI-NEXT: v_fma_f32 v9, -v10, v8, v9 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 -; SI-NEXT: v_add_f32_e32 v8, v9, v8 -; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SI-NEXT: v_ldexp_f32_e64 v8, v8, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v8, s0, v8, v6 -; SI-NEXT: .LBB10_8: -; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 -; SI-NEXT: v_cvt_f32_f16_e64 v10, |v9| -; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11 -; SI-NEXT: s_cbranch_vccz .LBB10_10 -; SI-NEXT: ; %bb.9: ; %frem.else53 -; SI-NEXT: v_and_b32_e32 v12, 0x80000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; SI-NEXT: v_ldexp_f32_e64 v3, v3, s1 +; SI-NEXT: v_mul_f32_e32 v4, v3, v4 +; SI-NEXT: v_rndne_f32_e32 v4, v4 +; SI-NEXT: v_fma_f32 v3, -v4, v2, v3 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3 +; SI-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ldexp_f32_e64 v2, v2, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; SI-NEXT: s_and_b32 s0, s3, 0xffff8000 +; SI-NEXT: v_or_b32_e32 v3, s0, v2 +; SI-NEXT: .LBB10_9: ; %Flow136 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_lshr_b32 s5, s3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: ; %bb.10: +; SI-NEXT: v_cvt_f32_f16_e64 v6, |s5| +; SI-NEXT: v_cvt_f32_f16_e64 v5, |v4| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; SI-NEXT: s_cbranch_vccz .LBB10_13 +; SI-NEXT: ; %bb.11: ; %frem.else53 +; SI-NEXT: s_and_b32 s6, s5, 0x8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s5 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_11 -; SI-NEXT: s_branch .LBB10_16 -; SI-NEXT: .LBB10_10: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_cbranch_execz .LBB10_14 +; SI-NEXT: ; %bb.12: +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_branch .LBB10_19 +; SI-NEXT: .LBB10_13: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_11: ; %frem.compute52 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 +; SI-NEXT: .LBB10_14: ; %frem.compute52 +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v9 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v9, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v9, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v9, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v11 +; SI-NEXT: v_readfirstlane_b32 s0, v7 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v7, v5 +; SI-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v5 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v11 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v9, v9, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v5 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v5, v7, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v9, 1.0 -; SI-NEXT: v_div_scale_f32 v12, s[4:5], v9, v9, 1.0 -; SI-NEXT: v_rcp_f32_e32 v13, v12 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v5, 1.0 +; SI-NEXT: v_div_scale_f32 v8, s[10:11], v5, v5, 1.0 +; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 -; SI-NEXT: v_fma_f32 v13, v14, v13, v13 -; SI-NEXT: v_mul_f32_e32 v14, v11, v13 -; SI-NEXT: v_fma_f32 v15, -v12, v14, v11 -; SI-NEXT: v_fma_f32 v14, v15, v13, v14 -; SI-NEXT: v_fma_f32 v11, -v12, v14, v11 +; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v10, v9, v9 +; SI-NEXT: v_mul_f32_e32 v10, v7, v9 +; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v10, v11, v9, v10 +; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v11, v11, v13, v14 -; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 +; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_18 +; SI-NEXT: ; %bb.15: ; %frem.loop_body60.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_13: ; %frem.loop_body60 +; SI-NEXT: .LBB10_16: ; %frem.loop_body60 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v12, v10 -; SI-NEXT: v_mul_f32_e32 v10, v12, v11 -; SI-NEXT: v_rndne_f32_e32 v10, v10 -; SI-NEXT: v_fma_f32 v10, -v10, v9, v12 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; SI-NEXT: v_add_f32_e32 v13, v10, v9 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v10, 11 -; SI-NEXT: s_add_i32 s1, s1, -11 -; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_13 -; SI-NEXT: ; %bb.14: ; %Flow129 -; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB10_15: ; %frem.loop_exit61 -; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1 -; SI-NEXT: v_mul_f32_e32 v11, v10, v11 -; SI-NEXT: v_rndne_f32_e32 v11, v11 -; SI-NEXT: v_fma_f32 v10, -v11, v9, v10 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; SI-NEXT: v_add_f32_e32 v9, v10, v9 -; SI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; SI-NEXT: v_ldexp_f32_e64 v9, v9, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v9, s0, v9, v4 -; SI-NEXT: .LBB10_16: -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e64 v11, |v10| -; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12 -; SI-NEXT: s_cbranch_vccz .LBB10_18 -; SI-NEXT: ; %bb.17: ; %frem.else20 -; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v12 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mul_f32_e32 v6, v8, v7 +; SI-NEXT: v_rndne_f32_e32 v6, v6 +; SI-NEXT: v_fma_f32 v6, -v6, v5, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v9, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, 11 +; SI-NEXT: s_add_i32 s1, s1, -11 +; SI-NEXT: s_cmp_gt_i32 s1, 11 +; SI-NEXT: s_cbranch_scc1 .LBB10_16 +; SI-NEXT: ; %bb.17: ; %Flow129 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: .LBB10_18: ; %frem.loop_exit61 +; SI-NEXT: s_add_i32 s1, s1, -10 +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1 +; SI-NEXT: v_mul_f32_e32 v7, v6, v7 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v6, -v7, v5, v6 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6 +; SI-NEXT: v_add_f32_e32 v5, v6, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_ldexp_f32_e64 v5, v5, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; SI-NEXT: s_and_b32 s0, s5, 0x8000 +; SI-NEXT: v_or_b32_e32 v5, s0, v5 +; SI-NEXT: .LBB10_19: +; SI-NEXT: v_cvt_f32_f16_e64 v7, |s2| +; SI-NEXT: v_cvt_f32_f16_e64 v6, |v1| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; SI-NEXT: s_cbranch_vccz .LBB10_22 +; SI-NEXT: ; %bb.20: ; %frem.else20 +; SI-NEXT: s_and_b32 s6, s2, 0xffff8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s2 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_19 -; SI-NEXT: s_branch .LBB10_24 -; SI-NEXT: .LBB10_18: -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_cbranch_execz .LBB10_23 +; SI-NEXT: ; %bb.21: +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_branch .LBB10_28 +; SI-NEXT: .LBB10_22: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_19: ; %frem.compute19 -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11 +; SI-NEXT: .LBB10_23: ; %frem.compute19 +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v10 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v10, v11 -; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v10, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v10, v12 -; SI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v12, v12 +; SI-NEXT: v_readfirstlane_b32 s0, v8 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v8, v6 +; SI-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v6, v6 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v12 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v10, v10, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v6, v8, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v10, 1.0 -; SI-NEXT: v_div_scale_f32 v13, s[4:5], v10, v10, 1.0 -; SI-NEXT: v_rcp_f32_e32 v14, v13 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v6, 1.0 +; SI-NEXT: v_div_scale_f32 v9, s[10:11], v6, v6, 1.0 +; SI-NEXT: v_rcp_f32_e32 v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 -; SI-NEXT: v_fma_f32 v14, v15, v14, v14 -; SI-NEXT: v_mul_f32_e32 v15, v12, v14 -; SI-NEXT: v_fma_f32 v16, -v13, v15, v12 -; SI-NEXT: v_fma_f32 v15, v16, v14, v15 -; SI-NEXT: v_fma_f32 v12, -v13, v15, v12 +; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; SI-NEXT: v_fma_f32 v10, v11, v10, v10 +; SI-NEXT: v_mul_f32_e32 v11, v8, v10 +; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v11, v12, v10, v11 +; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v12, v12, v14, v15 -; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 +; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; SI-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_27 +; SI-NEXT: ; %bb.24: ; %frem.loop_body27.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_21: ; %frem.loop_body27 +; SI-NEXT: .LBB10_25: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v13, v11 -; SI-NEXT: v_mul_f32_e32 v11, v13, v12 -; SI-NEXT: v_rndne_f32_e32 v11, v11 -; SI-NEXT: v_fma_f32 v11, -v11, v10, v13 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; SI-NEXT: v_add_f32_e32 v14, v11, v10 -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v11, 11 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_mul_f32_e32 v7, v9, v8 +; SI-NEXT: v_rndne_f32_e32 v7, v7 +; SI-NEXT: v_fma_f32 v7, -v7, v6, v9 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; SI-NEXT: v_add_f32_e32 v10, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_21 -; SI-NEXT: ; %bb.22: ; %Flow125 -; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB10_23: ; %frem.loop_exit28 +; SI-NEXT: s_cbranch_scc1 .LBB10_25 +; SI-NEXT: ; %bb.26: ; %Flow125 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: .LBB10_27: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1 -; SI-NEXT: v_mul_f32_e32 v12, v11, v12 -; SI-NEXT: v_rndne_f32_e32 v12, v12 -; SI-NEXT: v_fma_f32 v11, -v12, v10, v11 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; SI-NEXT: v_add_f32_e32 v10, v11, v10 -; SI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SI-NEXT: v_ldexp_f32_e64 v10, v10, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v10, s0, v10, v2 -; SI-NEXT: .LBB10_24: -; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v12, |v11| -; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13| -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13 -; SI-NEXT: s_cbranch_vccz .LBB10_26 -; SI-NEXT: ; %bb.25: ; %frem.else -; SI-NEXT: v_and_b32_e32 v14, 0x80000000, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, s1 +; SI-NEXT: v_mul_f32_e32 v8, v7, v8 +; SI-NEXT: v_rndne_f32_e32 v8, v8 +; SI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; SI-NEXT: v_add_f32_e32 v6, v7, v6 +; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SI-NEXT: v_ldexp_f32_e64 v6, v6, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; SI-NEXT: s_and_b32 s0, s2, 0xffff8000 +; SI-NEXT: v_or_b32_e32 v6, s0, v6 +; SI-NEXT: .LBB10_28: +; SI-NEXT: v_cvt_f32_f16_e64 v8, |s4| +; SI-NEXT: v_cvt_f32_f16_e64 v7, |v2| +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 +; SI-NEXT: s_cbranch_vccz .LBB10_31 +; SI-NEXT: ; %bb.29: ; %frem.else +; SI-NEXT: s_and_b32 s6, s4, 0x8000 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s6, s6, s4 ; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB10_27 -; SI-NEXT: s_branch .LBB10_32 -; SI-NEXT: .LBB10_26: -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_cbranch_execz .LBB10_32 +; SI-NEXT: ; %bb.30: +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_branch .LBB10_37 +; SI-NEXT: .LBB10_31: +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_27: ; %frem.compute -; SI-NEXT: s_mov_b32 s3, 0x7f800000 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 -; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 +; SI-NEXT: .LBB10_32: ; %frem.compute +; SI-NEXT: s_mov_b32 s7, 0x7f800000 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s7 +; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v8 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v11 -; SI-NEXT: s_cselect_b32 s2, s0, 0 -; SI-NEXT: v_frexp_mant_f32_e32 v11, v12 -; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; SI-NEXT: v_ldexp_f32_e64 v12, v11, 11 -; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v13|, s3 -; SI-NEXT: v_frexp_mant_f32_e32 v11, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc -; SI-NEXT: v_frexp_exp_i32_f32_e32 v13, v13 +; SI-NEXT: v_readfirstlane_b32 s0, v9 +; SI-NEXT: s_cselect_b32 s6, s0, 0 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v8 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 11 +; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s7 +; SI-NEXT: v_frexp_mant_f32_e32 v9, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, v7, v9, vcc +; SI-NEXT: v_frexp_exp_i32_f32_e32 v7, v7 ; SI-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s0, v13 -; SI-NEXT: s_cselect_b32 s3, s0, 0 -; SI-NEXT: s_add_i32 s0, s3, -1 -; SI-NEXT: v_ldexp_f32_e64 v11, v11, 1 +; SI-NEXT: v_readfirstlane_b32 s0, v7 +; SI-NEXT: s_cselect_b32 s7, s0, 0 +; SI-NEXT: s_add_i32 s0, s7, -1 +; SI-NEXT: v_ldexp_f32_e64 v7, v9, 1 ; SI-NEXT: s_not_b32 s1, s0 -; SI-NEXT: s_add_i32 s1, s1, s2 -; SI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v11, 1.0 -; SI-NEXT: v_div_scale_f32 v14, s[4:5], v11, v11, 1.0 -; SI-NEXT: v_rcp_f32_e32 v15, v14 +; SI-NEXT: s_add_i32 s1, s1, s6 +; SI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v7, 1.0 +; SI-NEXT: v_div_scale_f32 v10, s[10:11], v7, v7, 1.0 +; SI-NEXT: v_rcp_f32_e32 v11, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 -; SI-NEXT: v_fma_f32 v15, v16, v15, v15 -; SI-NEXT: v_mul_f32_e32 v16, v13, v15 -; SI-NEXT: v_fma_f32 v17, -v14, v16, v13 -; SI-NEXT: v_fma_f32 v16, v17, v15, v16 -; SI-NEXT: v_fma_f32 v13, -v14, v16, v13 +; SI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; SI-NEXT: v_fma_f32 v11, v12, v11, v11 +; SI-NEXT: v_mul_f32_e32 v12, v9, v11 +; SI-NEXT: v_fma_f32 v13, -v10, v12, v9 +; SI-NEXT: v_fma_f32 v12, v13, v11, v12 +; SI-NEXT: v_fma_f32 v9, -v10, v12, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v13, v13, v15, v16 -; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 +; SI-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; SI-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 -; SI-NEXT: s_cbranch_scc1 .LBB10_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader -; SI-NEXT: s_sub_i32 s1, s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB10_36 +; SI-NEXT: ; %bb.33: ; %frem.loop_body.preheader +; SI-NEXT: s_sub_i32 s1, s6, s7 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_29: ; %frem.loop_body +; SI-NEXT: .LBB10_34: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mul_f32_e32 v12, v14, v13 -; SI-NEXT: v_rndne_f32_e32 v12, v12 -; SI-NEXT: v_fma_f32 v12, -v12, v11, v14 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; SI-NEXT: v_add_f32_e32 v15, v12, v11 -; SI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; SI-NEXT: v_ldexp_f32_e64 v12, v12, 11 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mul_f32_e32 v8, v10, v9 +; SI-NEXT: v_rndne_f32_e32 v8, v8 +; SI-NEXT: v_fma_f32 v8, -v8, v7, v10 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; SI-NEXT: v_add_f32_e32 v11, v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SI-NEXT: v_ldexp_f32_e64 v8, v8, 11 ; SI-NEXT: s_add_i32 s1, s1, -11 ; SI-NEXT: s_cmp_gt_i32 s1, 11 -; SI-NEXT: s_cbranch_scc1 .LBB10_29 -; SI-NEXT: ; %bb.30: ; %Flow -; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB10_31: ; %frem.loop_exit +; SI-NEXT: s_cbranch_scc1 .LBB10_34 +; SI-NEXT: ; %bb.35: ; %Flow +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: .LBB10_36: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 -; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1 -; SI-NEXT: v_mul_f32_e32 v13, v12, v13 -; SI-NEXT: v_rndne_f32_e32 v13, v13 -; SI-NEXT: v_fma_f32 v12, -v13, v11, v12 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; SI-NEXT: v_add_f32_e32 v11, v12, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; SI-NEXT: v_ldexp_f32_e64 v11, v11, s0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v11, s0, v11, v0 -; SI-NEXT: .LBB10_32: ; %Flow124 +; SI-NEXT: v_ldexp_f32_e64 v8, v8, s1 +; SI-NEXT: v_mul_f32_e32 v9, v8, v9 +; SI-NEXT: v_rndne_f32_e32 v9, v9 +; SI-NEXT: v_fma_f32 v8, -v9, v7, v8 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; SI-NEXT: v_add_f32_e32 v7, v8, v7 +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; SI-NEXT: v_ldexp_f32_e64 v7, v7, s0 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; SI-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 +; SI-NEXT: v_and_b32_e32 v7, 0x7fff, v7 +; SI-NEXT: s_and_b32 s0, s4, 0x8000 +; SI-NEXT: v_or_b32_e32 v7, s0, v7 +; SI-NEXT: .LBB10_37: ; %Flow124 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; SI-NEXT: s_mov_b32 s3, 0x7f800000 +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v0 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 -; SI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 +; SI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; SI-NEXT: v_cvt_f32_f16_e64 v4, |s5| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v4 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v5, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v1 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v6, vcc ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; SI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v2 ; SI-NEXT: s_and_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s10 -; CI-NEXT: s_mov_b32 s5, s11 -; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 -; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; CI-NEXT: s_mov_b32 s0, s10 +; CI-NEXT: s_mov_b32 s1, s11 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v6 -; CI-NEXT: v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7| -; CI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v6 -; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7 -; CI-NEXT: s_and_b64 vcc, exec, s[0:1] -; CI-NEXT: s_cbranch_vccz .LBB10_2 +; CI-NEXT: v_readfirstlane_b32 s2, v1 +; CI-NEXT: v_readfirstlane_b32 s3, v0 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3| +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v0| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; CI-NEXT: s_cbranch_vccz .LBB10_3 ; CI-NEXT: ; %bb.1: ; %frem.else86 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_and_b32_e32 v11, 0x80000000, v6 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc -; CI-NEXT: s_cbranch_execz .LBB10_3 -; CI-NEXT: s_branch .LBB10_8 -; CI-NEXT: .LBB10_2: -; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB10_3: ; %frem.compute85 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 -; CI-NEXT: v_frexp_mant_f32_e32 v8, v10 -; CI-NEXT: v_frexp_mant_f32_e32 v10, v9 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v9 -; CI-NEXT: v_ldexp_f32_e64 v9, v10, 1 -; CI-NEXT: v_div_scale_f32 v15, s[0:1], v9, v9, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v11, v8, 11 -; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v14 -; CI-NEXT: v_not_b32_e32 v10, v8 -; CI-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CI-NEXT: v_div_scale_f32 v12, vcc, 1.0, v9, 1.0 -; CI-NEXT: v_rcp_f32_e32 v16, v15 +; CI-NEXT: s_and_b32 s4, s3, 0xffff8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s4, s4, s3 +; CI-NEXT: s_cbranch_execz .LBB10_4 +; CI-NEXT: ; %bb.2: +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: s_branch .LBB10_9 +; CI-NEXT: .LBB10_3: +; CI-NEXT: ; implicit-def: $sgpr4 +; CI-NEXT: .LBB10_4: ; %frem.compute85 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v3 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v3 +; CI-NEXT: v_ldexp_f32_e64 v5, v3, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v3, v2 +; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 +; CI-NEXT: v_div_scale_f32 v9, s[0:1], v3, v3, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8 +; CI-NEXT: v_not_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v17, -v15, v16, 1.0 -; CI-NEXT: v_fma_f32 v16, v17, v16, v16 -; CI-NEXT: v_mul_f32_e32 v17, v12, v16 -; CI-NEXT: v_fma_f32 v18, -v15, v17, v12 -; CI-NEXT: v_fma_f32 v17, v18, v16, v17 -; CI-NEXT: v_fma_f32 v12, -v15, v17, v12 +; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; CI-NEXT: v_fma_f32 v10, v11, v10, v10 +; CI-NEXT: v_mul_f32_e32 v11, v6, v10 +; CI-NEXT: v_fma_f32 v12, -v9, v11, v6 +; CI-NEXT: v_fma_f32 v11, v12, v10, v11 +; CI-NEXT: v_fma_f32 v6, -v9, v11, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v12, v12, v16, v17 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 -; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader -; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 -; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10 -; CI-NEXT: .LBB10_5: ; %frem.loop_body93 +; CI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 +; CI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_8 +; CI-NEXT: ; %bb.5: ; %frem.loop_body93.preheader +; CI-NEXT: v_sub_i32_e32 v4, vcc, v7, v8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v4 +; CI-NEXT: .LBB10_6: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v13, v11 -; CI-NEXT: v_mul_f32_e32 v11, v13, v12 -; CI-NEXT: v_rndne_f32_e32 v11, v11 -; CI-NEXT: v_fma_f32 v11, -v11, v9, v13 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; CI-NEXT: v_add_f32_e32 v14, v11, v9 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v10 -; CI-NEXT: v_ldexp_f32_e64 v11, v11, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_5 -; CI-NEXT: ; %bb.6: ; %Flow133 -; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 -; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10 -; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 -; CI-NEXT: v_mul_f32_e32 v11, v10, v12 -; CI-NEXT: v_rndne_f32_e32 v11, v11 -; CI-NEXT: v_fma_f32 v10, -v11, v9, v10 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 -; CI-NEXT: v_add_f32_e32 v9, v10, v9 -; CI-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_bfi_b32 v8, s0, v8, v6 -; CI-NEXT: .LBB10_8: -; CI-NEXT: v_cvt_f16_f32_e32 v9, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v11, |v9| -; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 -; CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else53 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_and_b32_e32 v12, 0x80000000, v4 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10 -; CI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; CI-NEXT: s_cbranch_execz .LBB10_11 -; CI-NEXT: s_branch .LBB10_16 -; CI-NEXT: .LBB10_10: -; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB10_11: ; %frem.compute52 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 -; CI-NEXT: v_frexp_mant_f32_e32 v9, v11 -; CI-NEXT: v_frexp_mant_f32_e32 v11, v10 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v10 -; CI-NEXT: v_ldexp_f32_e64 v10, v11, 1 -; CI-NEXT: v_div_scale_f32 v16, s[0:1], v10, v10, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v12, v9, 11 -; CI-NEXT: v_add_i32_e32 v9, vcc, -1, v15 -; CI-NEXT: v_not_b32_e32 v11, v9 -; CI-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CI-NEXT: v_div_scale_f32 v13, vcc, 1.0, v10, 1.0 -; CI-NEXT: v_rcp_f32_e32 v17, v16 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mul_f32_e32 v5, v7, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v5, -v5, v3, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5 +; CI-NEXT: v_add_f32_e32 v8, v5, v3 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4 +; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_6 +; CI-NEXT: ; %bb.7: ; %Flow133 +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB10_8: ; %frem.loop_exit94 +; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 +; CI-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; CI-NEXT: v_mul_f32_e32 v5, v4, v6 +; CI-NEXT: v_rndne_f32_e32 v5, v5 +; CI-NEXT: v_fma_f32 v4, -v5, v3, v4 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4 +; CI-NEXT: v_add_f32_e32 v3, v4, v3 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_and_b32 s0, s3, 0xffff8000 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; CI-NEXT: v_or_b32_e32 v3, s0, v2 +; CI-NEXT: .LBB10_9: ; %Flow136 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: ; %bb.10: +; CI-NEXT: v_cvt_f32_f16_e64 v6, |s5| +; CI-NEXT: v_cvt_f32_f16_e64 v5, |v4| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 +; CI-NEXT: s_cbranch_vccz .LBB10_13 +; CI-NEXT: ; %bb.11: ; %frem.else53 +; CI-NEXT: s_and_b32 s6, s5, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s5 +; CI-NEXT: s_cbranch_execz .LBB10_14 +; CI-NEXT: ; %bb.12: +; CI-NEXT: v_mov_b32_e32 v5, s6 +; CI-NEXT: s_branch .LBB10_19 +; CI-NEXT: .LBB10_13: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_14: ; %frem.compute52 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v6 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v6 +; CI-NEXT: v_ldexp_f32_e64 v8, v6, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 +; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1 +; CI-NEXT: v_div_scale_f32 v12, s[0:1], v6, v6, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v11 +; CI-NEXT: v_not_b32_e32 v7, v5 +; CI-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; CI-NEXT: v_rcp_f32_e32 v13, v12 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v18, -v16, v17, 1.0 -; CI-NEXT: v_fma_f32 v17, v18, v17, v17 -; CI-NEXT: v_mul_f32_e32 v18, v13, v17 -; CI-NEXT: v_fma_f32 v19, -v16, v18, v13 -; CI-NEXT: v_fma_f32 v18, v19, v17, v18 -; CI-NEXT: v_fma_f32 v13, -v16, v18, v13 +; CI-NEXT: v_fma_f32 v14, -v12, v13, 1.0 +; CI-NEXT: v_fma_f32 v13, v14, v13, v13 +; CI-NEXT: v_mul_f32_e32 v14, v9, v13 +; CI-NEXT: v_fma_f32 v15, -v12, v14, v9 +; CI-NEXT: v_fma_f32 v14, v15, v13, v14 +; CI-NEXT: v_fma_f32 v9, -v12, v14, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v13, v13, v17, v18 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 -; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader -; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 -; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11 -; CI-NEXT: .LBB10_13: ; %frem.loop_body60 +; CI-NEXT: v_div_fmas_f32 v9, v9, v13, v14 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 +; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_18 +; CI-NEXT: ; %bb.15: ; %frem.loop_body60.preheader +; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 +; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7 +; CI-NEXT: .LBB10_16: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v14, v12 -; CI-NEXT: v_mul_f32_e32 v12, v14, v13 -; CI-NEXT: v_rndne_f32_e32 v12, v12 -; CI-NEXT: v_fma_f32 v12, -v12, v10, v14 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; CI-NEXT: v_add_f32_e32 v15, v12, v10 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v11 -; CI-NEXT: v_ldexp_f32_e64 v12, v12, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_13 -; CI-NEXT: ; %bb.14: ; %Flow129 -; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 -; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11 -; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 -; CI-NEXT: v_mul_f32_e32 v12, v11, v13 -; CI-NEXT: v_rndne_f32_e32 v12, v12 -; CI-NEXT: v_fma_f32 v11, -v12, v10, v11 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v11 -; CI-NEXT: v_add_f32_e32 v10, v11, v10 -; CI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_bfi_b32 v9, s0, v9, v4 -; CI-NEXT: .LBB10_16: -; CI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v12, |v10| -; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11 -; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else20 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_and_b32_e32 v13, 0x80000000, v2 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v12, v11 -; CI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; CI-NEXT: s_cbranch_execz .LBB10_19 -; CI-NEXT: s_branch .LBB10_24 -; CI-NEXT: .LBB10_18: -; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB10_19: ; %frem.compute19 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12 -; CI-NEXT: v_frexp_mant_f32_e32 v10, v12 -; CI-NEXT: v_frexp_mant_f32_e32 v12, v11 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v11 -; CI-NEXT: v_ldexp_f32_e64 v11, v12, 1 -; CI-NEXT: v_div_scale_f32 v17, s[0:1], v11, v11, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v13, v10, 11 -; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v16 -; CI-NEXT: v_not_b32_e32 v12, v10 -; CI-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CI-NEXT: v_div_scale_f32 v14, vcc, 1.0, v11, 1.0 -; CI-NEXT: v_rcp_f32_e32 v18, v17 +; CI-NEXT: v_mov_b32_e32 v10, v8 +; CI-NEXT: v_mul_f32_e32 v8, v10, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v8, -v8, v6, v10 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v11, v8, v6 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_16 +; CI-NEXT: ; %bb.17: ; %Flow129 +; CI-NEXT: v_mov_b32_e32 v8, v10 +; CI-NEXT: .LBB10_18: ; %frem.loop_exit61 +; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7 +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 +; CI-NEXT: v_mul_f32_e32 v8, v7, v9 +; CI-NEXT: v_rndne_f32_e32 v8, v8 +; CI-NEXT: v_fma_f32 v7, -v8, v6, v7 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7 +; CI-NEXT: v_add_f32_e32 v6, v7, v6 +; CI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; CI-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: s_and_b32 s0, s5, 0x8000 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff, v5 +; CI-NEXT: v_or_b32_e32 v5, s0, v5 +; CI-NEXT: .LBB10_19: +; CI-NEXT: v_cvt_f32_f16_e64 v7, |s2| +; CI-NEXT: v_cvt_f32_f16_e64 v6, |v1| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; CI-NEXT: s_cbranch_vccz .LBB10_22 +; CI-NEXT: ; %bb.20: ; %frem.else20 +; CI-NEXT: s_and_b32 s6, s2, 0xffff8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s2 +; CI-NEXT: s_cbranch_execz .LBB10_23 +; CI-NEXT: ; %bb.21: +; CI-NEXT: v_mov_b32_e32 v6, s6 +; CI-NEXT: s_branch .LBB10_28 +; CI-NEXT: .LBB10_22: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_23: ; %frem.compute19 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v11, v7 +; CI-NEXT: v_frexp_mant_f32_e32 v7, v7 +; CI-NEXT: v_ldexp_f32_e64 v9, v7, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v7, v6 +; CI-NEXT: v_ldexp_f32_e64 v7, v7, 1 +; CI-NEXT: v_div_scale_f32 v13, s[0:1], v7, v7, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v12, v6 +; CI-NEXT: v_add_i32_e32 v6, vcc, -1, v12 +; CI-NEXT: v_not_b32_e32 v8, v6 +; CI-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v7, 1.0 +; CI-NEXT: v_rcp_f32_e32 v14, v13 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v19, -v17, v18, 1.0 -; CI-NEXT: v_fma_f32 v18, v19, v18, v18 -; CI-NEXT: v_mul_f32_e32 v19, v14, v18 -; CI-NEXT: v_fma_f32 v20, -v17, v19, v14 -; CI-NEXT: v_fma_f32 v19, v20, v18, v19 -; CI-NEXT: v_fma_f32 v14, -v17, v19, v14 +; CI-NEXT: v_fma_f32 v15, -v13, v14, 1.0 +; CI-NEXT: v_fma_f32 v14, v15, v14, v14 +; CI-NEXT: v_mul_f32_e32 v15, v10, v14 +; CI-NEXT: v_fma_f32 v16, -v13, v15, v10 +; CI-NEXT: v_fma_f32 v15, v16, v14, v15 +; CI-NEXT: v_fma_f32 v10, -v13, v15, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v14, v14, v18, v19 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12 -; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader -; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 -; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12 -; CI-NEXT: .LBB10_21: ; %frem.loop_body27 +; CI-NEXT: v_div_fmas_f32 v10, v10, v14, v15 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8 +; CI-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_27 +; CI-NEXT: ; %bb.24: ; %frem.loop_body27.preheader +; CI-NEXT: v_sub_i32_e32 v8, vcc, v11, v12 +; CI-NEXT: v_add_i32_e32 v8, vcc, 11, v8 +; CI-NEXT: .LBB10_25: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v15, v13 -; CI-NEXT: v_mul_f32_e32 v13, v15, v14 -; CI-NEXT: v_rndne_f32_e32 v13, v13 -; CI-NEXT: v_fma_f32 v13, -v13, v11, v15 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 -; CI-NEXT: v_add_f32_e32 v16, v13, v11 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc -; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v12 -; CI-NEXT: v_ldexp_f32_e64 v13, v13, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_21 -; CI-NEXT: ; %bb.22: ; %Flow125 -; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 -; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12 -; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 -; CI-NEXT: v_mul_f32_e32 v13, v12, v14 -; CI-NEXT: v_rndne_f32_e32 v13, v13 -; CI-NEXT: v_fma_f32 v12, -v13, v11, v12 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v12 -; CI-NEXT: v_add_f32_e32 v11, v12, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_bfi_b32 v10, s0, v10, v2 -; CI-NEXT: .LBB10_24: -; CI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v13, |v11| -; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12| -; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 -; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_and_b32_e32 v14, 0x80000000, v0 -; CI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; CI-NEXT: s_cbranch_execz .LBB10_27 -; CI-NEXT: s_branch .LBB10_32 -; CI-NEXT: .LBB10_26: -; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: .LBB10_27: ; %frem.compute -; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 -; CI-NEXT: v_frexp_mant_f32_e32 v11, v13 -; CI-NEXT: v_frexp_mant_f32_e32 v13, v12 -; CI-NEXT: v_frexp_exp_i32_f32_e32 v17, v12 -; CI-NEXT: v_ldexp_f32_e64 v12, v13, 1 -; CI-NEXT: v_div_scale_f32 v18, s[0:1], v12, v12, 1.0 -; CI-NEXT: v_ldexp_f32_e64 v14, v11, 11 -; CI-NEXT: v_add_i32_e32 v11, vcc, -1, v17 -; CI-NEXT: v_not_b32_e32 v13, v11 -; CI-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CI-NEXT: v_div_scale_f32 v15, vcc, 1.0, v12, 1.0 -; CI-NEXT: v_rcp_f32_e32 v19, v18 +; CI-NEXT: v_mov_b32_e32 v11, v9 +; CI-NEXT: v_mul_f32_e32 v9, v11, v10 +; CI-NEXT: v_rndne_f32_e32 v9, v9 +; CI-NEXT: v_fma_f32 v9, -v9, v7, v11 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; CI-NEXT: v_add_f32_e32 v12, v9, v7 +; CI-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; CI-NEXT: v_add_i32_e32 v8, vcc, -11, v8 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v8 +; CI-NEXT: v_ldexp_f32_e64 v9, v9, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_25 +; CI-NEXT: ; %bb.26: ; %Flow125 +; CI-NEXT: v_mov_b32_e32 v9, v11 +; CI-NEXT: .LBB10_27: ; %frem.loop_exit28 +; CI-NEXT: v_add_i32_e32 v8, vcc, -10, v8 +; CI-NEXT: v_ldexp_f32_e32 v8, v9, v8 +; CI-NEXT: v_mul_f32_e32 v9, v8, v10 +; CI-NEXT: v_rndne_f32_e32 v9, v9 +; CI-NEXT: v_fma_f32 v8, -v9, v7, v8 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v8 +; CI-NEXT: v_add_f32_e32 v7, v8, v7 +; CI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: s_and_b32 s0, s2, 0xffff8000 +; CI-NEXT: v_and_b32_e32 v6, 0x7fff, v6 +; CI-NEXT: v_or_b32_e32 v6, s0, v6 +; CI-NEXT: .LBB10_28: +; CI-NEXT: v_cvt_f32_f16_e64 v8, |s4| +; CI-NEXT: v_cvt_f32_f16_e64 v7, |v2| +; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 +; CI-NEXT: s_cbranch_vccz .LBB10_31 +; CI-NEXT: ; %bb.29: ; %frem.else +; CI-NEXT: s_and_b32 s6, s4, 0x8000 +; CI-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: s_cselect_b32 s6, s6, s4 +; CI-NEXT: s_cbranch_execz .LBB10_32 +; CI-NEXT: ; %bb.30: +; CI-NEXT: v_mov_b32_e32 v7, s6 +; CI-NEXT: s_branch .LBB10_37 +; CI-NEXT: .LBB10_31: +; CI-NEXT: ; implicit-def: $sgpr6 +; CI-NEXT: .LBB10_32: ; %frem.compute +; CI-NEXT: v_frexp_exp_i32_f32_e32 v12, v8 +; CI-NEXT: v_frexp_mant_f32_e32 v8, v8 +; CI-NEXT: v_ldexp_f32_e64 v10, v8, 11 +; CI-NEXT: v_frexp_mant_f32_e32 v8, v7 +; CI-NEXT: v_ldexp_f32_e64 v8, v8, 1 +; CI-NEXT: v_div_scale_f32 v14, s[0:1], v8, v8, 1.0 +; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v7 +; CI-NEXT: v_add_i32_e32 v7, vcc, -1, v13 +; CI-NEXT: v_not_b32_e32 v9, v7 +; CI-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CI-NEXT: v_div_scale_f32 v11, vcc, 1.0, v8, 1.0 +; CI-NEXT: v_rcp_f32_e32 v15, v14 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v20, -v18, v19, 1.0 -; CI-NEXT: v_fma_f32 v19, v20, v19, v19 -; CI-NEXT: v_mul_f32_e32 v20, v15, v19 -; CI-NEXT: v_fma_f32 v21, -v18, v20, v15 -; CI-NEXT: v_fma_f32 v20, v21, v19, v20 -; CI-NEXT: v_fma_f32 v15, -v18, v20, v15 +; CI-NEXT: v_fma_f32 v16, -v14, v15, 1.0 +; CI-NEXT: v_fma_f32 v15, v16, v15, v15 +; CI-NEXT: v_mul_f32_e32 v16, v11, v15 +; CI-NEXT: v_fma_f32 v17, -v14, v16, v11 +; CI-NEXT: v_fma_f32 v16, v17, v15, v16 +; CI-NEXT: v_fma_f32 v11, -v14, v16, v11 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v15, v15, v19, v20 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 -; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 -; CI-NEXT: s_cbranch_vccnz .LBB10_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader -; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 -; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13 -; CI-NEXT: .LBB10_29: ; %frem.loop_body +; CI-NEXT: v_div_fmas_f32 v11, v11, v15, v16 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 +; CI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 +; CI-NEXT: s_cbranch_vccnz .LBB10_36 +; CI-NEXT: ; %bb.33: ; %frem.loop_body.preheader +; CI-NEXT: v_sub_i32_e32 v9, vcc, v12, v13 +; CI-NEXT: v_add_i32_e32 v9, vcc, 11, v9 +; CI-NEXT: .LBB10_34: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v16, v14 -; CI-NEXT: v_mul_f32_e32 v14, v16, v15 -; CI-NEXT: v_rndne_f32_e32 v14, v14 -; CI-NEXT: v_fma_f32 v14, -v14, v12, v16 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v14 -; CI-NEXT: v_add_f32_e32 v17, v14, v12 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc -; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v13 -; CI-NEXT: v_ldexp_f32_e64 v14, v14, 11 -; CI-NEXT: s_cbranch_vccnz .LBB10_29 -; CI-NEXT: ; %bb.30: ; %Flow -; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB10_31: ; %frem.loop_exit -; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13 -; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 -; CI-NEXT: v_mul_f32_e32 v14, v13, v15 -; CI-NEXT: v_rndne_f32_e32 v14, v14 -; CI-NEXT: v_fma_f32 v13, -v14, v12, v13 -; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v13 -; CI-NEXT: v_add_f32_e32 v12, v13, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: s_brev_b32 s0, -2 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_bfi_b32 v11, s0, v11, v0 -; CI-NEXT: .LBB10_32: ; %Flow124 +; CI-NEXT: v_mov_b32_e32 v12, v10 +; CI-NEXT: v_mul_f32_e32 v10, v12, v11 +; CI-NEXT: v_rndne_f32_e32 v10, v10 +; CI-NEXT: v_fma_f32 v10, -v10, v8, v12 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v10 +; CI-NEXT: v_add_f32_e32 v13, v10, v8 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; CI-NEXT: v_add_i32_e32 v9, vcc, -11, v9 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v9 +; CI-NEXT: v_ldexp_f32_e64 v10, v10, 11 +; CI-NEXT: s_cbranch_vccnz .LBB10_34 +; CI-NEXT: ; %bb.35: ; %Flow +; CI-NEXT: v_mov_b32_e32 v10, v12 +; CI-NEXT: .LBB10_36: ; %frem.loop_exit +; CI-NEXT: v_add_i32_e32 v9, vcc, -10, v9 +; CI-NEXT: v_ldexp_f32_e32 v9, v10, v9 +; CI-NEXT: v_mul_f32_e32 v10, v9, v11 +; CI-NEXT: v_rndne_f32_e32 v10, v10 +; CI-NEXT: v_fma_f32 v9, -v10, v8, v9 +; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v9 +; CI-NEXT: v_add_f32_e32 v8, v9, v8 +; CI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: s_mov_b32 s2, 0x7f800000 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v6 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_mov_b32_e32 v6, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v4, |v4| -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v4 -; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc -; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; CI-NEXT: s_and_b32 s0, s4, 0x8000 +; CI-NEXT: v_and_b32_e32 v7, 0x7fff, v7 +; CI-NEXT: v_or_b32_e32 v7, s0, v7 +; CI-NEXT: .LBB10_37: ; %Flow124 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3| +; CI-NEXT: s_mov_b32 s3, 0x7f800000 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v0 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 +; CI-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s5| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v4 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v0, v5, vcc ; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s2, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v1 ; CI-NEXT: s_and_b64 vcc, s[0:1], vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v1, v0, v6, vcc +; CI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4| +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_cmp_nle_f32_e64 s[0:1], s3, v2 +; CI-NEXT: s_and_b64 vcc, s[0:1], vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v1, v2, v0 +; CI-NEXT: v_or_b32_e32 v1, v1, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v7, v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index d4bddf26d0ed3..4b7e08f814b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9207,14 +9207,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9250,14 +9248,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -9726,14 +9722,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9770,14 +9764,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10250,14 +10242,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10294,14 +10284,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10746,23 +10734,21 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -10788,24 +10774,22 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -11249,14 +11233,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11292,14 +11274,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11757,14 +11737,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11800,14 +11778,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12160,8 +12136,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12195,8 +12169,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12533,10 +12505,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12567,10 +12537,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13041,14 +13009,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13085,14 +13051,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13552,14 +13516,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13595,14 +13557,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19572,49 +19532,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB64_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -19626,51 +19576,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB64_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -19826,49 +19765,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19880,51 +19809,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB65_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20081,53 +19999,43 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -20139,55 +20047,44 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB66_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20325,41 +20222,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20375,47 +20263,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB67_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -20554,41 +20432,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB68_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20604,47 +20473,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB68_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -20784,45 +20643,36 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB69_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20838,51 +20688,41 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB69_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21042,49 +20882,39 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB70_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21096,51 +20926,40 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB70_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21283,41 +21102,32 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB71_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21333,47 +21143,37 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB71_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -21543,49 +21343,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB72_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: @@ -21597,51 +21387,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB72_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -21804,41 +21583,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB73_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21854,47 +21624,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB73_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -22049,49 +21809,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -22103,51 +21853,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB74_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %result @@ -22284,41 +22023,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22334,47 +22064,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB75_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void @@ -22543,49 +22263,39 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: @@ -22597,51 +22307,40 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB76_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -22804,41 +22503,32 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22854,47 +22544,37 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB77_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index bcf51f89920c0..041a77c960f04 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4777,14 +4777,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4820,14 +4818,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5233,14 +5229,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5277,14 +5271,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5693,14 +5685,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5737,14 +5727,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6125,23 +6113,21 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6167,24 +6153,22 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -6569,14 +6553,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6612,14 +6594,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7016,14 +6996,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7059,14 +7037,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7374,8 +7350,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7409,8 +7383,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7705,10 +7677,8 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7739,10 +7709,8 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8151,14 +8119,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8195,14 +8161,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8603,14 +8567,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8646,14 +8608,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13773,49 +13733,39 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13827,51 +13777,40 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14076,49 +14015,39 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14130,51 +14059,40 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14380,53 +14298,43 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14438,55 +14346,44 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14684,41 +14581,32 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14734,47 +14622,37 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -14973,41 +14851,32 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15023,47 +14892,37 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15263,45 +15122,36 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15317,51 +15167,41 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15570,49 +15410,39 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15624,51 +15454,40 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15871,41 +15690,32 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15921,47 +15731,37 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_max_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 9406e08e9e412..e13a16b762d6d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4777,14 +4777,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4820,14 +4818,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5233,14 +5229,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5277,14 +5271,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5693,14 +5685,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5737,14 +5727,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6125,23 +6113,21 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6167,24 +6153,22 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -6569,14 +6553,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6612,14 +6594,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7016,14 +6996,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7059,14 +7037,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7374,8 +7350,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7409,8 +7383,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7705,10 +7677,8 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7739,10 +7709,8 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8151,14 +8119,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8195,14 +8161,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8603,14 +8567,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8646,14 +8608,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13773,49 +13733,39 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13827,51 +13777,40 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result @@ -14076,49 +14015,39 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14130,51 +14059,40 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14380,53 +14298,43 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14438,55 +14346,44 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14684,41 +14581,32 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14734,47 +14622,37 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -14973,41 +14851,32 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15023,47 +14892,37 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15263,45 +15122,36 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15317,51 +15167,41 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15570,49 +15410,39 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15624,51 +15454,40 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -15871,41 +15690,32 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15921,47 +15731,37 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_min_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index f4b7280062bb8..0229a482ca17b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5513,14 +5513,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5556,14 +5554,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5944,14 +5940,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5988,14 +5982,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6379,14 +6371,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6423,14 +6413,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6791,23 +6779,21 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -6833,24 +6819,22 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -7210,14 +7194,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7253,14 +7235,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7632,14 +7612,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7675,14 +7653,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7970,8 +7946,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8005,8 +7979,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8281,10 +8253,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8315,10 +8285,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8702,14 +8670,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8746,14 +8712,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9129,14 +9093,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9172,14 +9134,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14262,49 +14222,39 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16: @@ -14316,51 +14266,40 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -14548,49 +14487,39 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: @@ -14602,51 +14531,40 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -14835,53 +14753,43 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: @@ -14893,55 +14801,44 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15120,41 +15017,32 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15170,47 +15058,37 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -15390,41 +15268,32 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15440,47 +15309,37 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15661,45 +15520,36 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15715,51 +15565,41 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -15951,49 +15791,39 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: @@ -16005,51 +15835,40 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %result = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -16233,41 +16052,32 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX7-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16283,47 +16093,37 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX6-NEXT: v_sub_f32_e32 v7, v7, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 %unused = atomicrmw fsub ptr addrspace(1) %gep, <2 x half> %val seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 5154ba95aec78..8048bf6b6e4e5 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -404,11 +404,11 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -460,12 +460,11 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s6, s1, 16 -; CI-NEXT: s_lshr_b32 s7, s0, 16 -; CI-NEXT: s_lshr_b32 s8, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; CI-NEXT: s_lshr_b32 s6, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; CI-NEXT: s_lshr_b32 s7, s3, 16 +; CI-NEXT: s_lshr_b32 s8, s0, 16 +; CI-NEXT: s_lshr_b32 s9, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 @@ -473,7 +472,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -652,53 +652,29 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 } define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; CI-LABEL: extload_v3f16_to_v3f64_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_add_u32 s2, s0, 16 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_mov_b32_e32 v6, s2 -; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: s_endpgm -; -; VI-LABEL: extload_v3f16_to_v3f64_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v6, s2 -; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v3f16_to_v3f64_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CIVI-NEXT: s_add_u32 s2, s0, 16 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; CIVI-NEXT: s_addc_u32 s3, s1, 0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CIVI-NEXT: v_mov_b32_e32 v7, s3 +; CIVI-NEXT: v_mov_b32_e32 v6, s2 +; CIVI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] +; CIVI-NEXT: v_mov_b32_e32 v5, s1 +; CIVI-NEXT: v_mov_b32_e32 v4, s0 +; CIVI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: @@ -815,37 +791,37 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 ; CI-NEXT: s_lshr_b32 s6, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s7, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s8, s1, 16 -; CI-NEXT: s_lshr_b32 s6, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; CI-NEXT: s_add_u32 s0, s4, 48 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: s_add_u32 s0, s4, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; CI-NEXT: s_addc_u32 s1, s5, 0 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v9, s1 @@ -1134,12 +1110,12 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1205,14 +1181,14 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; CI-NEXT: s_endpgm ; @@ -1280,14 +1256,14 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1368,18 +1344,18 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -1473,61 +1449,61 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -1917,21 +1893,21 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; CI-NEXT: v_mov_b32_e32 v11, s3 -; CI-NEXT: v_mov_b32_e32 v10, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; CI-NEXT: s_endpgm @@ -2045,26 +2021,26 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 ; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 ; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] @@ -2202,91 +2178,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v18, s3 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v12, s1 +; CI-NEXT: v_mov_b32_e32 v11, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; CI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v20 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v22 +; CI-NEXT: v_mov_b32_e32 v18, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -3146,11 +3123,11 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3213,14 +3190,14 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: s_lshr_b32 s2, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v1, v2, v3 +; CI-NEXT: v_add_f32_e32 v1, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v2, v0, v1 @@ -3276,30 +3253,30 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_add_f32_e32 v7, v7, v9 -; CI-NEXT: v_add_f32_e32 v6, v6, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_add_f32_e32 v2, v7, v9 +; CI-NEXT: v_add_f32_e32 v3, v6, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v1, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v3 ; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; CI-NEXT: s_endpgm ; @@ -3350,58 +3327,58 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s10, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 -; CI-NEXT: s_lshr_b32 s0, s5, 16 -; CI-NEXT: s_lshr_b32 s11, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: s_lshr_b32 s13, s3, 16 +; CI-NEXT: s_lshr_b32 s14, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s14 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s13 ; CI-NEXT: s_lshr_b32 s12, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 -; CI-NEXT: s_lshr_b32 s10, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 -; CI-NEXT: s_lshr_b32 s0, s7, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s12 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; CI-NEXT: v_add_f32_e32 v1, v1, v9 -; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: v_add_f32_e32 v3, v3, v11 -; CI-NEXT: v_add_f32_e32 v2, v2, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v5, v5, v13 +; CI-NEXT: s_lshr_b32 s15, s6, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CI-NEXT: v_add_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s15 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s12 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v4, v4, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_add_f32_e32 v7, v7, v14 +; CI-NEXT: v_add_f32_e32 v1, v2, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v2, v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_lshr_b32 s14, s5, 16 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_add_f32_e32 v6, v6, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_or_b32_e32 v3, v1, v0 +; CI-NEXT: v_add_f32_e32 v1, v5, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s1 +; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: s_lshr_b32 s13, s4, 16 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 +; CI-NEXT: v_add_f32_e32 v0, v5, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; CI-NEXT: v_add_f32_e32 v1, v7, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_add_f32_e32 v5, v7, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v1, v5, v1 -; CI-NEXT: v_or_b32_e32 v0, v4, v0 +; CI-NEXT: v_or_b32_e32 v1, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v5, v0 ; CI-NEXT: v_mov_b32_e32 v4, s8 -; CI-NEXT: v_or_b32_e32 v3, v7, v3 -; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 92ea83fdfb982..c24ee53de2e76 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2480,10 +2480,10 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s1, s4, 0xffff -; CI-NEXT: s_lshl_b32 s2, s4, 16 +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: s_and_b32 s2, s4, 0xffff ; CI-NEXT: s_lshl_b32 s3, s5, 4 -; CI-NEXT: s_or_b32 s2, s1, s2 +; CI-NEXT: s_or_b32 s2, s2, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 @@ -2839,60 +2839,48 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: s_cmp_eq_u32 s5, 6 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v6, s4 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: s_cmp_eq_u32 s5, 7 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 5 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc +; CI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_or_b32_e32 v3, v7, v3 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; CI-NEXT: v_or_b32_e32 v2, v2, v7 +; CI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; CI-NEXT: v_or_b32_e32 v3, v3, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; CI-NEXT: v_or_b32_e32 v2, v2, v7 -; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v1, v1, v7 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -3425,7 +3413,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -3440,119 +3428,94 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CI-NEXT: s_cmp_eq_u32 s5, 15 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_add_i32_e32 v11, vcc, 16, v4 +; CI-NEXT: s_cmp_eq_u32 s7, 14 +; CI-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v6, s6 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 14 -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 13 -; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 12 +; CI-NEXT: s_cmp_eq_u32 s7, 15 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] -; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] +; CI-NEXT: v_cndmask_b32_e32 v13, v10, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 10 +; CI-NEXT: s_cmp_eq_u32 s7, 12 +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 13 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 ; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v8, v8, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 -; CI-NEXT: s_cmp_eq_u32 s5, 9 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc -; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 10 +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 11 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s7, 8 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; CI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CI-NEXT: v_cndmask_b32_e64 v15, v15, v6, s[2:3] +; CI-NEXT: s_cmp_eq_u32 s7, 9 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 6 +; CI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] +; CI-NEXT: v_or_b32_e32 v8, v8, v15 +; CI-NEXT: v_cndmask_b32_e32 v15, v16, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_cmp_eq_u32 s7, 7 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; CI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; CI-NEXT: v_cndmask_b32_e64 v14, v14, v6, s[0:1] +; CI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 4 +; CI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; CI-NEXT: v_or_b32_e32 v10, v13, v10 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; CI-NEXT: v_or_b32_e32 v7, v7, v15 +; CI-NEXT: v_cndmask_b32_e32 v15, v17, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_cmp_eq_u32 s7, 5 +; CI-NEXT: v_or_b32_e32 v9, v9, v13 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 2 +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: s_cmp_eq_u32 s7, 3 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; CI-NEXT: s_cmp_eq_u32 s7, 0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_cmp_eq_u32 s7, 1 +; CI-NEXT: v_or_b32_e32 v2, v2, v13 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc +; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v3, v3, v15 +; CI-NEXT: v_or_b32_e32 v1, v1, v14 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; CI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..0a53b3a906fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -6187,3 +6187,167 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 ret void } + +define amdgpu_kernel void @f16_arg(half %arg, ptr addrspace(1) %ptr) { +; SI-LABEL: f16_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: f16_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: f16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: f16_arg: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, +; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, T1.W, PV.W, +; EG-NEXT: LSHL * T0.W, literal.x, PV.W, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: f16_arg: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; CM-NEXT: LSHL T0.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T0.W, literal.x, PV.W, +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: MOV T0.Y, 0.0, +; CM-NEXT: MOV * T0.Z, 0.0, +; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + store half %arg, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @v2f16_arg(<2 x half> %arg, ptr addrspace(1) %ptr) { +; SI-LABEL: v2f16_arg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v2f16_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v2f16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: v2f16_arg: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHL * T0.W, T1.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, T0.X, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: v2f16_arg: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @10, KC0[], KC1[] +; CM-NEXT: TEX 1 @6 +; CM-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 38, #3 +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 11: +; CM-NEXT: LSHL * T0.W, T1.X, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: OR_INT * T0.X, T0.X, PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + store <2 x half> %arg, ptr addrspace(1) %ptr + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 769bf0a6458b2..4e6b0018f661b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -143,19 +143,19 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 -; GFX6-NEXT: v_fract_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 +; GFX6-NEXT: v_fract_f32_e32 v1, v1 +; GFX6-NEXT: v_cos_f32_e32 v1, v1 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 -; GFX6-NEXT: v_cos_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index ee01c9d0acdc7..1485e3f88f942 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5405,26 +5405,28 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; SI-SDAG-NEXT: s_mov_b32 s5, 0x32a5705f ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x32a5705f ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x42b17218 ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x42b17218 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5575,17 +5577,13 @@ define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_fast: @@ -5727,26 +5725,28 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 { ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_daz: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; SI-SDAG-NEXT: s_mov_b32 s5, 0x32a5705f ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x32a5705f ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x42b17218 ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x42b17218 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5880,23 +5880,14 @@ define half @v_exp_fneg_fabs_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_fabs_f16: ; R600: ; %bb.0: @@ -5932,23 +5923,14 @@ define half @v_exp_fneg_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fneg_f16: ; R600: ; %bb.0: @@ -6552,8 +6534,8 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 @@ -6658,10 +6640,16 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 7d830a9306293..c4204417362a4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5467,26 +5467,28 @@ define float @v_exp10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x40549a78 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x33979a37 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x33979a37 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc23369f4 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x421a209b ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x421a209b -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5640,20 +5642,16 @@ define float @v_exp10_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc217b818 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_f32_from_fpext_math_f16_fast: @@ -5798,26 +5796,28 @@ define float @v_exp10_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 { ; SI-SDAG-LABEL: v_exp10_f32_from_fpext_math_f16_daz: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x40549a78 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x33979a37 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x33979a37 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-SDAG-NEXT: s_mov_b32 s4, 0xc23369f4 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x421a209b ; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x421a209b -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5951,23 +5951,14 @@ define half @v_exp10_fneg_fabs_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_fabs_f16: ; R600: ; %bb.0: @@ -6003,23 +5994,14 @@ define half @v_exp10_fneg_f16(half %in) { ; GCN-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fneg_f16: ; R600: ; %bb.0: @@ -6691,8 +6673,8 @@ define <3 x half> @v_exp10_v3f16(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 @@ -6839,11 +6821,23 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3a278000, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40548000, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a278000, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a278000, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40548000, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 @@ -6861,8 +6855,8 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v2, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 97ecb5362a4bc..21c7f56aa0816 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2759,18 +2759,12 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_exp2_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f32_from_fpext_math_f16: @@ -2976,22 +2970,13 @@ define half @v_exp2_fabs_f16(half %in) { } define half @v_exp2_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_fneg_fabs_f16: ; VI: ; %bb.0: @@ -3026,22 +3011,13 @@ define half @v_exp2_fneg_fabs_f16(half %in) { } define half @v_exp2_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_fneg_f16: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 97ea988581ce3..e8bf198f89855 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -16,29 +16,74 @@ define amdgpu_kernel void @fma_f16( ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s0, v1 +; SI-NEXT: s_and_b32 s1, s0, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s1, v0 +; SI-NEXT: s_lshr_b32 s3, s0, 8 +; SI-NEXT: s_bfe_u32 s4, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s1, s3, 0xffe +; SI-NEXT: s_sub_i32 s3, 0x3f1, s4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s3, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s6, s3, s5 +; SI-NEXT: s_lshl_b32 s5, s6, s5 +; SI-NEXT: s_cmp_lg_u32 s5, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s4, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s4, 12 +; SI-NEXT: s_or_b32 s3, s6, s3 +; SI-NEXT: s_or_b32 s5, s1, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s5 +; SI-NEXT: s_and_b32 s5, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s3, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -112,29 +157,74 @@ define amdgpu_kernel void @fma_f16( define amdgpu_kernel void @fma_f16_imm_a( ; SI-LABEL: fma_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, s2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_a: @@ -195,29 +285,74 @@ define amdgpu_kernel void @fma_f16_imm_a( define amdgpu_kernel void @fma_f16_imm_b( ; SI-LABEL: fma_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, s2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_b: @@ -278,29 +413,74 @@ define amdgpu_kernel void @fma_f16_imm_b( define amdgpu_kernel void @fma_f16_imm_c( ; SI-LABEL: fma_f16_imm_c: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v0, v0, v1, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s10, s7, s9 +; SI-NEXT: s_lshl_b32 s9, s10, s9 +; SI-NEXT: s_cmp_lg_u32 s9, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s9, s8, 12 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s9, s5, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s9, 5 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 3 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_add_i32 s7, s7, s9 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_f16_imm_c: @@ -376,30 +556,119 @@ define amdgpu_kernel void @fma_v2f16( ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v0, v4, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_fma_f32 v1, v3, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v9 +; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: v_readfirstlane_b32 s0, v1 +; SI-NEXT: s_and_b32 s1, s0, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s1, v0 +; SI-NEXT: s_lshr_b32 s3, s0, 8 +; SI-NEXT: s_bfe_u32 s4, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s1, s3, 0xffe +; SI-NEXT: s_sub_i32 s3, 0x3f1, s4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s3, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s7, s3, s6 +; SI-NEXT: s_lshl_b32 s6, s7, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s4, 0xfc10 +; SI-NEXT: s_lshl_b32 s6, s4, 12 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_or_b32 s6, s1, s6 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s6 +; SI-NEXT: s_and_b32 s6, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s3, s3, s6 +; SI-NEXT: v_fma_f64 v[2:3], v[10:11], v[8:9], v[6:7] +; SI-NEXT: s_cmp_lt_i32 s4, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_and_b32 s3, s5, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s3, v2 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s4, s5, 8 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s3, s4, 0xffe +; SI-NEXT: s_sub_i32 s4, 0x3f1, s6 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: v_med3_i32 v1, s4, 0, 13 +; SI-NEXT: s_or_b32 s1, s3, s1 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_or_b32 s3, s1, 0x1000 +; SI-NEXT: s_lshr_b32 s7, s3, s4 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_lshl_b32 s4, s7, s4 +; SI-NEXT: s_cmp_lg_u32 s4, s3 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_lshl_b32 s4, s6, 12 +; SI-NEXT: s_or_b32 s3, s7, s3 +; SI-NEXT: s_or_b32 s4, s1, s4 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s4, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_lshr_b32 s3, s3, 2 +; SI-NEXT: s_or_b32 s4, s4, s7 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s1, s1, s3 +; SI-NEXT: s_lshr_b32 s2, s5, 16 +; SI-NEXT: s_and_b32 s2, s2, 0x8000 +; SI-NEXT: s_or_b32 s1, s2, s1 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -481,37 +750,125 @@ define amdgpu_kernel void @fma_v2f16( define amdgpu_kernel void @fma_v2f16_imm_a( ; SI-LABEL: fma_v2f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], s[4:5], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_a: @@ -578,37 +935,125 @@ define amdgpu_kernel void @fma_v2f16_imm_a( define amdgpu_kernel void @fma_v2f16_imm_b( ; SI-LABEL: fma_v2f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], s[4:5], v[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_b: @@ -675,37 +1120,125 @@ define amdgpu_kernel void @fma_v2f16_imm_b( define amdgpu_kernel void @fma_v2f16_imm_c( ; SI-LABEL: fma_v2f16_imm_c: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0x40400000 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40080000 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_movk_i32 s6, 0x7e00 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, v2, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, v0, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; SI-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b32 s5, s4, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s5, v0 +; SI-NEXT: s_lshr_b32 s7, s4, 8 +; SI-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b32 s5, s7, 0xffe +; SI-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s10 +; SI-NEXT: s_lshl_b32 s10, s11, s10 +; SI-NEXT: s_cmp_lg_u32 s10, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s8, 0xfc10 +; SI-NEXT: s_lshl_b32 s10, s8, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s10, s5, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s10, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s10, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, s10 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s9, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s7, v2 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_lshr_b32 s8, s9, 8 +; SI-NEXT: s_bfe_u32 s10, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_and_b32 s7, s8, 0xffe +; SI-NEXT: s_sub_i32 s8, 0x3f1, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: v_med3_i32 v1, s8, 0, 13 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: s_or_b32 s7, s5, 0x1000 +; SI-NEXT: s_lshr_b32 s11, s7, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s8, s11, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s7 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_addk_i32 s10, 0xfc10 +; SI-NEXT: s_lshl_b32 s8, s10, 12 +; SI-NEXT: s_or_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s8, s5, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s7, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s11, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s7, s7, 2 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_add_i32 s7, s7, s8 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s6, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fma_v2f16_imm_c: @@ -787,45 +1320,222 @@ define amdgpu_kernel void @fma_v4f16( ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dwordx2 v[5:6], off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 -; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_movk_i32 s2, 0x7e00 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[5:6], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v10 +; SI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; SI-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; SI-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; SI-NEXT: v_fma_f64 v[5:6], v[9:10], v[7:8], v[5:6] +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_readfirstlane_b32 s0, v6 +; SI-NEXT: v_fma_f64 v[2:3], v[15:16], v[13:14], v[11:12] +; SI-NEXT: s_and_b32 s3, s0, 0x1ff +; SI-NEXT: v_readfirstlane_b32 s1, v3 +; SI-NEXT: v_or_b32_e32 v3, s3, v5 +; SI-NEXT: s_lshr_b32 s4, s0, 8 +; SI-NEXT: s_bfe_u32 s5, s0, 0xb0014 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_b32 s3, s4, 0xffe +; SI-NEXT: s_sub_i32 s4, 0x3f1, s5 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_med3_i32 v5, s4, 0, 13 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_or_b32 s3, s3, s4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s12, s4, s7 +; SI-NEXT: s_lshl_b32 s7, s12, s7 +; SI-NEXT: s_cmp_lg_u32 s7, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s5, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s5, 12 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_or_b32 s7, s3, s7 +; SI-NEXT: s_cmp_lt_i32 s5, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s7 +; SI-NEXT: s_and_b32 s7, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s7, 5 +; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v17 +; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v18 +; SI-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_add_i32 s4, s4, s7 +; SI-NEXT: v_fma_f64 v[7:8], v[17:18], v[9:10], v[7:8] +; SI-NEXT: s_cmp_lt_i32 s5, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s4, s6, 0x1ff +; SI-NEXT: v_or_b32_e32 v3, s4, v7 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_lshr_b32 s5, s6, 8 +; SI-NEXT: s_bfe_u32 s7, s6, 0xb0014 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: s_and_b32 s4, s5, 0xffe +; SI-NEXT: s_sub_i32 s5, 0x3f1, s7 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: v_readfirstlane_b32 s3, v3 +; SI-NEXT: v_med3_i32 v5, s5, 0, 13 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, s5 +; SI-NEXT: s_cmp_lg_u32 s5, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s7, 12 +; SI-NEXT: s_or_b32 s4, s12, s4 +; SI-NEXT: s_or_b32 s5, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_add_i32 s4, s4, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s5, s1, 0x1ff +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_lshr_b32 s6, s1, 8 +; SI-NEXT: s_bfe_u32 s7, s1, 0xb0014 +; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: s_and_b32 s5, s6, 0xffe +; SI-NEXT: s_sub_i32 s6, 0x3f1, s7 +; SI-NEXT: s_lshl_b32 s3, s3, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_med3_i32 v3, s6, 0, 13 +; SI-NEXT: s_or_b32 s0, s0, s3 +; SI-NEXT: s_or_b32 s3, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_or_b32 s4, s3, 0x1000 +; SI-NEXT: s_lshr_b32 s5, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s5, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s4 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc10 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s5, s7, 12 +; SI-NEXT: s_or_b32 s5, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s7, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_fma_f32 v7, v7, v9, v11 -; SI-NEXT: v_fma_f32 v6, v6, v8, v10 -; SI-NEXT: v_fma_f32 v1, v1, v3, v5 -; SI-NEXT: v_fma_f32 v0, v0, v2, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_add_i32 s4, s4, s5 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_cmp_lt_i32 s7, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s3, 0 +; SI-NEXT: s_cselect_b32 s3, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s7, 0x40f +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_lshr_b32 s1, s1, 16 +; SI-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: v_readfirstlane_b32 s3, v1 +; SI-NEXT: s_and_b32 s4, s3, 0x1ff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_lshr_b32 s4, s3, 8 +; SI-NEXT: s_bfe_u32 s6, s3, 0xb0014 +; SI-NEXT: s_and_b32 s4, s4, 0xffe +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_sub_i32 s7, 0x3f1, s6 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_med3_i32 v0, s7, 0, 13 +; SI-NEXT: s_or_b32 s5, s4, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_lshr_b32 s12, s5, s7 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_lshl_b32 s7, s12, s7 +; SI-NEXT: s_cmp_lg_u32 s7, s5 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_addk_i32 s6, 0xfc10 +; SI-NEXT: s_lshl_b32 s7, s6, 12 +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_or_b32 s7, s4, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s5, 7 +; SI-NEXT: s_cmp_gt_i32 s7, 5 +; SI-NEXT: s_cselect_b32 s12, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_or_b32 s7, s7, s12 +; SI-NEXT: s_lshr_b32 s5, s5, 2 +; SI-NEXT: s_add_i32 s5, s5, s7 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s2, s2, s5 +; SI-NEXT: s_lshr_b32 s3, s3, 16 +; SI-NEXT: s_and_b32 s3, s3, 0x8000 +; SI-NEXT: s_or_b32 s2, s3, s2 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s1, s1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 385d76bc42bda..a32a456c102dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -19,32 +19,33 @@ define amdgpu_kernel void @fmuladd_f16( ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -302,17 +303,20 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -534,17 +538,20 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -769,33 +776,39 @@ define amdgpu_kernel void @fmuladd_v2f16( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mac_f32_e32 v5, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_mac_f32_e32 v2, v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 8c4d4788c4bdf..43cc632d3708b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -650,16 +650,16 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v5, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v5, v3 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -816,13 +816,13 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v5, 16, v2 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v4, v5 ; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -935,23 +935,23 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX6-SDAG-LABEL: test_ldexp_v4f16_v4i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v2, v6, v3 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v5, v6, v5 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v4 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v7, v5 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v3, v6, v3 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v4 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1134,26 +1134,26 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v4, 16, v2 -; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GFX6-SDAG-NEXT: v_ashrrev_i32_e32 v7, 16, v3 ; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v5, v4 ; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v6, v7, v6 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v6, v6, v7 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v5, v4 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 4e8ffdcb00310..7903ae93d770c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -5901,26 +5901,21 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_f32_from_fpext_math_f16: @@ -6427,24 +6422,14 @@ define half @v_log_fabs_f16(half %in) { } define half @v_log_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_fneg_fabs_f16: ; VI: ; %bb.0: @@ -6508,24 +6493,14 @@ define half @v_log_fneg_fabs_f16(half %in) { } define half @v_log_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_fneg_f16: ; VI: ; %bb.0: @@ -7571,27 +7546,27 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,27 +7748,27 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 843b829f28742..478580ff8ec0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -5901,26 +5901,21 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log10_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_f32_from_fpext_math_f16: @@ -6427,24 +6422,14 @@ define half @v_log10_fabs_f16(half %in) { } define half @v_log10_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log10_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_fneg_fabs_f16: ; VI: ; %bb.0: @@ -6508,24 +6493,14 @@ define half @v_log10_fneg_fabs_f16(half %in) { } define half @v_log10_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log10_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_fneg_f16: ; VI: ; %bb.0: @@ -7571,27 +7546,27 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,27 +7748,27 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-SDAG-LABEL: v_log10_v4f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 35ae1337d8e76..8401e05b39c19 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3525,17 +3525,12 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-LABEL: v_log2_f32_from_fpext_math_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_f32_from_fpext_math_f16: @@ -3855,22 +3850,13 @@ define half @v_log2_fabs_f16(half %in) { } define half @v_log2_fneg_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log2_fneg_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_fneg_fabs_f16: ; VI: ; %bb.0: @@ -3924,22 +3910,13 @@ define half @v_log2_fneg_fabs_f16(half %in) { } define half @v_log2_fneg_f16(half %in) { -; SI-SDAG-LABEL: v_log2_fneg_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fneg_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fneg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_fneg_f16: ; VI: ; %bb.0: @@ -4784,23 +4761,23 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16: @@ -4936,23 +4913,23 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 3c27adde10b78..48f6c96df139d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -315,6 +315,8 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -413,6 +415,8 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -624,15 +628,15 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 @@ -733,13 +737,13 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -790,15 +794,15 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 @@ -899,13 +903,13 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX7-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -956,20 +960,20 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s4, s17, 16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX7-NEXT: s_lshr_b32 s4, s16, 16 +; GFX7-NEXT: s_lshr_b32 s5, s17, 16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, s17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, s16 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, s17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v2, v1, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v3, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v1, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1107,28 +1111,28 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1241,21 +1245,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,28 +1311,28 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1441,21 +1445,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX7-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1507,38 +1511,38 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v8, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc -; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v6, v7 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1672,29 +1676,29 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1748,38 +1752,38 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v8, v7, v6 +; GFX7-NEXT: v_max_f32_e32 v8, v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v7, v5, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v7, vcc -; GFX7-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v5, v6, v7 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v3, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1913,29 +1917,29 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX7-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,71 +1993,71 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX7-LABEL: v_maximum_v8f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_max_f32_e32 v15, v10, v11 +; GFX7-NEXT: v_mov_b32_e32 v16, 0x7fc00000 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v10, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v15, vcc +; GFX7-NEXT: v_max_f32_e32 v17, v9, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v16, v15, v14 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v14 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v16, vcc -; GFX7-NEXT: v_max_f32_e32 v15, v13, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_max_f32_e32 v15, v13, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v11 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v15, vcc -; GFX7-NEXT: v_max_f32_e32 v13, v11, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v15, vcc +; GFX7-NEXT: v_max_f32_e32 v13, v8, v12 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v8, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; GFX7-NEXT: v_max_f32_e32 v11, v9, v8 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v11, vcc -; GFX7-NEXT: v_max_f32_e32 v9, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v13, vcc +; GFX7-NEXT: v_max_f32_e32 v12, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v12, vcc ; GFX7-NEXT: v_max_f32_e32 v7, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v7, vcc ; GFX7-NEXT: v_max_f32_e32 v6, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v6, vcc ; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v5, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v12 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f16: @@ -2268,134 +2272,134 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v16 -; GFX7-NEXT: v_max_f32_e32 v16, v18, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v18, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; GFX7-NEXT: v_max_f32_e32 v17, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v20, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v16 +; GFX7-NEXT: v_max_f32_e32 v16, v19, v18 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v19, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX7-NEXT: v_max_f32_e32 v18, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v21, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_max_f32_e32 v23, v19, v18 -; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v19, v18 -; GFX7-NEXT: v_max_f32_e32 v18, v22, v20 -; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v22, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_max_f32_e32 v23, v20, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v20, v19 +; GFX7-NEXT: v_max_f32_e32 v19, v22, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v22, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_max_f32_e32 v22, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v20, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_max_f32_e32 v22, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v21, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v6, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_max_f32_e32 v24, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v20, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v15 -; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v25, v20, v19 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v20, v19 -; GFX7-NEXT: v_max_f32_e32 v19, v6, v14 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GFX7-NEXT: v_mov_b32_e32 v21, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v24, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v21, v20 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_max_f32_e32 v25, v21, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v21, v20 +; GFX7-NEXT: v_max_f32_e32 v20, v6, v14 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_max_f32_e32 v13, v5, v6 ; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v5, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v3, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_f32_e32 v14, v6, v5 ; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_f32_e32 v12, v3, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v21, v16, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v6, v5 -; GFX7-NEXT: v_max_f32_e32 v11, v4, v7 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v6, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v7 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v21, v3, s[24:25] -; GFX7-NEXT: v_max_f32_e32 v3, v2, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v21, v11, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v12, v3, v6 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v3, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v4, v5 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v7, v11 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v7, v11 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX7-NEXT: v_max_f32_e32 v5, v1, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v21, v3, vcc -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX7-NEXT: v_max_f32_e32 v7, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v21, v24, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v21, v5, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v21, v17, v24, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v7, vcc ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v21, v17, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v21, v18, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v18, v21, v22, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v21, v7, vcc -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v20 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v18, s[10:11] +; GFX7-NEXT: v_max_f32_e32 v15, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v18, v17, v22, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v21 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v15, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v18 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v21, v23, s[6:7] -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v17 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[22:23] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v21, v14, s[20:21] -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v21, v13, s[18:19] -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v13 +; GFX7-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[6:7] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v22, v21, v25, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[16:17] -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v15 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v19 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v22 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[24:25] +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v19 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v23, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v22, v17, v25, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v20, v17, v20, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[20:21] +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v12 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v22 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v13 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index de24617e058dd..69f17ed072425 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -442,14 +442,14 @@ define amdgpu_kernel void @maxnum_v2f16( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_max_f32_e32 v1, v2, v3 +; SI-NEXT: v_max_f32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -731,24 +731,24 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: s_lshr_b32 s9, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_max_f32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_max_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_max_f32_e32 v2, v3, v4 -; SI-NEXT: v_max_f32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v0, v3, v0 +; SI-NEXT: v_max_f32_e32 v2, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v3f16: @@ -859,39 +859,39 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_max_f32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s6, 16 +; SI-NEXT: v_max_f32_e32 v0, v1, v0 +; SI-NEXT: v_max_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1009,24 +1009,24 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v1, 0x41000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index e79324d7655fc..0e91d905d5585 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -473,14 +473,14 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: s_lshr_b32 s6, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v1, v2, v3 +; SI-NEXT: v_min_f32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -576,13 +576,13 @@ entry: define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: minnum_v2f16_no_ieee: ; SI: ; %bb.0: -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -796,24 +796,24 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: s_lshr_b32 s9, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 -; SI-NEXT: v_min_f32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s7 +; SI-NEXT: v_min_f32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_min_f32_e32 v2, v3, v4 -; SI-NEXT: v_min_f32_e32 v0, v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v0, v3, v0 +; SI-NEXT: v_min_f32_e32 v2, v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v3f16: @@ -923,39 +923,39 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_lshr_b32 s6, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: s_lshr_b32 s6, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_min_f32_e32 v2, v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_min_f32_e32 v1, v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s6, 16 +; SI-NEXT: v_min_f32_e32 v0, v1, v0 +; SI-NEXT: v_min_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1072,24 +1072,24 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s5, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v1, 0x41000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index b7fc76aecf080..2989ff02c9e8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -143,19 +143,19 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 -; GFX6-NEXT: v_fract_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 +; GFX6-NEXT: v_fract_f32_e32 v1, v1 +; GFX6-NEXT: v_sin_f32_e32 v1, v1 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 -; GFX6-NEXT: v_sin_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 9778c61c44e6e..90bf849483196 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6017,97 +6017,77 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fadd_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6252,98 +6232,78 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6482,41 +6442,32 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fadd_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6526,41 +6477,32 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fadd_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6702,41 +6644,32 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6746,42 +6679,33 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 91add012bdcfa..a71938582da52 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -5687,97 +5687,77 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmax_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5970,98 +5950,78 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmax ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6247,41 +6207,32 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmax_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6291,41 +6242,32 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmax_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6514,41 +6456,32 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6558,42 +6491,33 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmax_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 8597c2e256584..19a0d8bd717f7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -5687,97 +5687,77 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fmin_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -5970,98 +5950,78 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fmin ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -6247,41 +6207,32 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fmin_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6291,41 +6242,32 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fmin_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6514,41 +6456,32 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6558,42 +6491,33 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fmin_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 290d3117cac9a..e560215e4c066 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6510,97 +6510,77 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX7-LABEL: local_atomic_fsub_ret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -6776,98 +6756,78 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX7-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 %result = atomicrmw fsub ptr addrspace(3) %gep, <2 x half> %val seq_cst @@ -7034,41 +6994,32 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX7-LABEL: local_atomic_fsub_noret_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7078,41 +7029,32 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX6-LABEL: local_atomic_fsub_noret_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7282,41 +7224,32 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX7-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 offset:65532 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7326,42 +7259,33 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX6-LABEL: local_atomic_fsub_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX6-NEXT: ds_read_b32 v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v7, v3, v4 -; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_sub_f32_e32 v6, v6, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 7dc9304d5715b..7044afb09e371 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -38,9 +38,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -104,9 +104,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -179,9 +179,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_and_b32_e32 v1, 0xffff, v3 @@ -255,9 +255,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -337,9 +337,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -434,9 +434,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -498,12 +498,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -581,16 +585,20 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-CI-NEXT: s_mov_b32 s6, -1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 87d33c1c063eb..154d6c7079672 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -136,9 +136,9 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -197,9 +197,9 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1, ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -252,8 +252,8 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -304,11 +304,15 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -378,8 +382,8 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -469,12 +473,12 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 @@ -601,15 +605,15 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 @@ -777,36 +781,36 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-CI-LABEL: v_mad_mix_v4f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v10, v8 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v11, v9 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v9 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v4 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32: @@ -971,22 +975,30 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: @@ -1140,29 +1152,41 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1357,42 +1381,58 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mac_f32_e32 v6, v8, v7 -; SDAG-CI-NEXT: v_mac_f32_e32 v9, v11, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v10, v8 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v11, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 ; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_max_f32_e32 v2, 0, v2 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-CI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1560,22 +1600,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v4 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v5 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; SDAG-CI-NEXT: v_mac_f32_e32 v3, v0, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_min_f32_e32 v1, 1.0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1727,18 +1771,22 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_min_f32_e32 v0, 1.0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -1906,12 +1954,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v3, v5, v4, v3 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -2078,15 +2126,15 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 @@ -2284,35 +2332,35 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_mad_f32 v6, v8, v7, v6 clamp -; SDAG-CI-NEXT: v_mad_f32 v9, v11, v10, v9 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_mad_f32 v6, v10, v8, v6 clamp +; SDAG-CI-NEXT: v_mad_f32 v7, v11, v9, v7 clamp +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v7 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v9 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG-CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-CI-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG-CI-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index ee250fc74c7ae..fcd9dae983cfb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -53,24 +53,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2 ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -185,11 +175,10 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: @@ -271,16 +260,14 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v3, v5, v4 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v6, v7 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32: @@ -386,17 +373,16 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v4, v1, v2 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v5, v3 -; SDAG-CI-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SDAG-CI-NEXT: v_mad_f32 v0, v4, v0, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: @@ -463,9 +449,9 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -543,11 +529,11 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v0, v3 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: @@ -606,8 +592,8 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half ; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -664,21 +650,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -720,21 +698,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr ; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.neg = fneg float %src2 @@ -777,21 +747,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr ; VI-NEXT: v_mad_f32 v0, v0, v1, |v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -834,21 +796,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float ; VI-NEXT: v_mad_f32 v0, v0, v1, -|v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -900,13 +854,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; VI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; GISEL-GFX1100: ; %bb.0: @@ -929,14 +883,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0) @@ -985,8 +931,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1074,8 +1020,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1177,8 +1123,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,10 +1239,10 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; SDAG-CI-NEXT: v_mad_f32 v1, v3, v2, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -1427,10 +1373,10 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e230000 ; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 @@ -1564,10 +1510,10 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v4, 0x3e22f983 ; SDAG-CI-NEXT: v_mac_f32_e32 v1, v3, v2 @@ -1686,9 +1632,9 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,23 +1787,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -1903,21 +1840,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -1987,15 +1916,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-GFX1100: ; %bb.0: @@ -2007,16 +1936,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 ; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -2082,14 +2001,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-GFX1100: ; %bb.0: @@ -2100,15 +2019,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 ; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 ; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul float %src0.ext, %src1.ext @@ -2153,24 +2063,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -2214,21 +2114,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul contract float %src0.ext, %src1.ext @@ -2276,9 +2168,9 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1 ; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2377,11 +2269,11 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: @@ -2450,25 +2342,15 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1 ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; SDAG-CI: ; %bb.0: -; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 -; SDAG-CI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; GISEL-CI: ; %bb.0: -; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 -; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 %src0.abs = call half @llvm.fabs.f16(half %src0) @@ -2519,11 +2401,11 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, -v0 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -v3 -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: @@ -2606,11 +2488,11 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: @@ -2693,11 +2575,11 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, -|v0| ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -|v3| -; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v3, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 086c78fd041fc..806d941ac8730 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -27,14 +27,23 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_maximumnum_f16(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -905,14 +914,23 @@ define double @v_maximumnum_f64_1.0(double %x) { } define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_s_v: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_s_v: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_s_v: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_v: ; GFX8-SDAG: ; %bb.0: @@ -1070,14 +1088,23 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { } define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-LABEL: v_maximumnum_f16_v_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_v_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_v_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1235,14 +1262,23 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { } define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-LABEL: v_maximumnum_f16_s_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_s_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, s17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_s_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2563,14 +2599,23 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) { } define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_fabs_rhs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2716,9 +2761,8 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2874,14 +2918,23 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_maximumnum_f16_fabs(half %x, half %y) { -; GFX7-LABEL: v_maximumnum_f16_fabs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -3028,10 +3081,8 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3306,10 +3357,10 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 @@ -3460,13 +3511,13 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3544,19 +3595,19 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3730,21 +3781,21 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3833,29 +3884,29 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4057,29 +4108,29 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,39 +4226,39 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v10, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4465,54 +4516,54 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v14, v12 +; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v13 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v15 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_v8f16: @@ -4825,101 +4876,101 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v15 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v17, v18 -; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v16, v19 -; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v20, v21 -; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX7-SDAG-NEXT: v_max_f32_e32 v17, v19, v18 +; GFX7-SDAG-NEXT: v_max_f32_e32 v18, v21, v20 +; GFX7-SDAG-NEXT: v_max_f32_e32 v19, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v20, v21, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_max_f32_e32 v21, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_max_f32_e32 v22, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 -; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v23, v24, v23 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5453,17 +5504,23 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_maximumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -5472,189 +5529,185 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-SDAG-NEXT: v_max_f32_e32 v32, v33, v34 -; GFX7-SDAG-NEXT: v_max_f32_e32 v33, v35, v36 -; GFX7-SDAG-NEXT: v_max_f32_e32 v35, v37, v38 -; GFX7-SDAG-NEXT: v_max_f32_e32 v37, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX7-SDAG-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX7-SDAG-NEXT: v_max_f32_e32 v32, v36, v35 +; GFX7-SDAG-NEXT: v_max_f32_e32 v34, v38, v37 +; GFX7-SDAG-NEXT: v_max_f32_e32 v36, v48, v39 +; GFX7-SDAG-NEXT: v_max_f32_e32 v37, v51, v50 +; GFX7-SDAG-NEXT: v_max_f32_e32 v38, v53, v52 +; GFX7-SDAG-NEXT: v_max_f32_e32 v39, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GFX7-SDAG-NEXT: v_max_f32_e32 v36, v39, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_max_f32_e32 v38, v52, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v39, v54, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_max_f32_e32 v52, v40, v41 -; GFX7-SDAG-NEXT: v_max_f32_e32 v50, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_max_f32_e32 v50, v51, v50 +; GFX7-SDAG-NEXT: v_max_f32_e32 v51, v53, v52 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v52, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v53 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GFX7-SDAG-NEXT: v_max_f32_e32 v49, v43, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_max_f32_e32 v51, v51, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v53, v54, v55 -; GFX7-SDAG-NEXT: v_max_f32_e32 v54, v40, v41 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v34, v42, v34 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GFX7-SDAG-NEXT: v_max_f32_e32 v48, v41, v40 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v15, v48 -; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v40, v44, v43 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v35, v42, v35 +; GFX7-SDAG-NEXT: v_max_f32_e32 v54, v41, v55 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v40 +; GFX7-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v55, v42, v43 -; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v54 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GFX7-SDAG-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-SDAG-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v48 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-SDAG-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GFX7-SDAG-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-SDAG-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GFX7-SDAG-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_max_f32_e32 v15, v15, v49 +; GFX7-SDAG-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-SDAG-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v35 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v33 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v14, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8536,13 +8589,13 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8634,13 +8687,13 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8716,21 +8769,21 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8819,29 +8872,29 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_maximumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 0311caf93a14e..8c98931b02933 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -27,14 +27,23 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_minimumnum_f16(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -905,14 +914,23 @@ define double @v_minimumnum_f64_1.0(double %x) { } define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-LABEL: v_minimumnum_f16_v_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_v_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_v_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1070,14 +1088,23 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { } define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-LABEL: v_minimumnum_f16_s_s: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_s_s: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, s17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_s_s: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2398,14 +2425,23 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) { } define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16_fabs_rhs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2551,9 +2587,8 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2709,14 +2744,23 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_minimumnum_f16_fabs(half %x, half %y) { -; GFX7-LABEL: v_minimumnum_f16_fabs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -2863,10 +2907,8 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3141,10 +3183,10 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 @@ -3295,13 +3337,13 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3379,19 +3421,19 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3565,21 +3607,21 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3668,29 +3710,29 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3892,29 +3934,29 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4010,39 +4052,39 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v7 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v10, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4300,54 +4342,54 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v8f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v12 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v14, v12 +; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v13 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v15 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v12 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_v8f16: @@ -4660,101 +4702,101 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v16f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v15 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v17, v18 -; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v16, v19 -; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v20, v21 -; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX7-SDAG-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX7-SDAG-NEXT: v_min_f32_e32 v17, v19, v18 +; GFX7-SDAG-NEXT: v_min_f32_e32 v18, v21, v20 +; GFX7-SDAG-NEXT: v_min_f32_e32 v19, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v20, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v20, v21, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX7-SDAG-NEXT: v_min_f32_e32 v21, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v22, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_min_f32_e32 v22, v23, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 -; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v23, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v23, v24, v23 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v23 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v22 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v21 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v18 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v20 +; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v19 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5288,17 +5330,23 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-LABEL: v_minimumnum_v32f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: buffer_load_dword v48, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX7-SDAG-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v31, v31 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v32, v32 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -5307,189 +5355,185 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v36, v36 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v37, v37 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-SDAG-NEXT: v_min_f32_e32 v32, v33, v34 -; GFX7-SDAG-NEXT: v_min_f32_e32 v33, v35, v36 -; GFX7-SDAG-NEXT: v_min_f32_e32 v35, v37, v38 -; GFX7-SDAG-NEXT: v_min_f32_e32 v37, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX7-SDAG-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX7-SDAG-NEXT: v_min_f32_e32 v32, v36, v35 +; GFX7-SDAG-NEXT: v_min_f32_e32 v34, v38, v37 +; GFX7-SDAG-NEXT: v_min_f32_e32 v36, v48, v39 +; GFX7-SDAG-NEXT: v_min_f32_e32 v37, v51, v50 +; GFX7-SDAG-NEXT: v_min_f32_e32 v38, v53, v52 +; GFX7-SDAG-NEXT: v_min_f32_e32 v39, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v50, v50 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GFX7-SDAG-NEXT: v_min_f32_e32 v36, v39, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_min_f32_e32 v38, v52, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v39, v54, v55 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GFX7-SDAG-NEXT: v_min_f32_e32 v52, v40, v41 -; GFX7-SDAG-NEXT: v_min_f32_e32 v50, v50, v51 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v52, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_min_f32_e32 v50, v51, v50 +; GFX7-SDAG-NEXT: v_min_f32_e32 v51, v53, v52 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v52, v55, v54 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v53 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v53 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v40 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GFX7-SDAG-NEXT: v_min_f32_e32 v49, v43, v49 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v15 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GFX7-SDAG-NEXT: v_min_f32_e32 v51, v51, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v53, v54, v55 -; GFX7-SDAG-NEXT: v_min_f32_e32 v54, v40, v41 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v34, 16, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v34, v42, v34 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GFX7-SDAG-NEXT: v_min_f32_e32 v48, v41, v40 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(5) +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v49 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v15, v48 -; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v40, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v40, v44, v43 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v35, v42, v35 +; GFX7-SDAG-NEXT: v_min_f32_e32 v54, v41, v55 +; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v40 +; GFX7-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v55, v42, v43 -; GFX7-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v54 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v55 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v54 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GFX7-SDAG-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v51 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX7-SDAG-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v23, v23 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v52 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX7-SDAG-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-SDAG-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v48 +; GFX7-SDAG-NEXT: v_or_b32_e32 v5, v5, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v39 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v25, v25 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-SDAG-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v6, v6, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GFX7-SDAG-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v27, v27 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-SDAG-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX7-SDAG-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GFX7-SDAG-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v35 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v10, v10 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v29, v29 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v33 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_min_f32_e32 v15, v15, v49 +; GFX7-SDAG-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-SDAG-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-SDAG-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v35 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v33 +; GFX7-SDAG-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v31 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v19, v32 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v32 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v14, v14 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v17, v34 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v18, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v18, v15 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX7-SDAG-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX7-SDAG-NEXT: v_or_b32_e32 v14, v14, v17 +; GFX7-SDAG-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8371,13 +8415,13 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8469,13 +8513,13 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -8551,21 +8595,21 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8654,29 +8698,29 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) ; GFX7-SDAG-LABEL: v_minimumnum_v4f16_nnan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index 90632c663bf4a..7ed68dd6a00fe 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -1086,7 +1086,9 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1142,7 +1144,9 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 mul:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1197,7 +1201,9 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, v0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index ff3a735bd32b4..2d3524d711788 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -251,15 +251,17 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 { ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 +; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6 ; GFX6-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -533,15 +535,17 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX6-NEXT: v_fma_f32 v5, v6, v5, v5 ; GFX6-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0 +; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v5 ; GFX6-NEXT: v_fma_f32 v8, -v4, v7, v6 ; GFX6-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v1 @@ -784,41 +788,45 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x ; GFX6-LABEL: v_repeat_divisor_v2f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX6-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v7, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX6-NEXT: v_fma_f32 v7, v8, v7, v7 -; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v4, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v9, v8, v7 -; GFX6-NEXT: v_fma_f32 v10, -v6, v9, v8 -; GFX6-NEXT: v_fma_f32 v9, v10, v7, v9 -; GFX6-NEXT: v_fma_f32 v6, -v6, v9, v8 -; GFX6-NEXT: v_div_fmas_f32 v6, v6, v7, v9 +; GFX6-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX6-NEXT: v_div_scale_f32 v7, vcc, 1.0, v3, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v8, v7, v5 +; GFX6-NEXT: v_fma_f32 v9, -v4, v8, v7 +; GFX6-NEXT: v_fma_f32 v8, v9, v5, v8 +; GFX6-NEXT: v_fma_f32 v4, -v4, v8, v7 ; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v2, v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v8, v7 -; GFX6-NEXT: v_div_fixup_f32 v4, v6, v4, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v9, v7 +; GFX6-NEXT: v_div_fmas_f32 v4, v4, v5, v8 +; GFX6-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0 +; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 +; GFX6-NEXT: v_fma_f32 v4, -v7, v9, 1.0 +; GFX6-NEXT: v_fma_f32 v4, v4, v9, v9 +; GFX6-NEXT: v_mul_f32_e32 v8, v5, v4 +; GFX6-NEXT: v_fma_f32 v9, -v7, v8, v5 +; GFX6-NEXT: v_fma_f32 v8, v9, v4, v8 +; GFX6-NEXT: v_fma_f32 v5, -v7, v8, v5 +; GFX6-NEXT: v_div_fmas_f32 v4, v5, v4, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_div_fixup_f32 v2, v4, v2, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_fma_f32 v6, -v7, v8, 1.0 -; GFX6-NEXT: v_fma_f32 v6, v6, v8, v8 -; GFX6-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v9, v8, v6 -; GFX6-NEXT: v_fma_f32 v10, -v7, v9, v8 -; GFX6-NEXT: v_fma_f32 v9, v10, v6, v9 -; GFX6-NEXT: v_fma_f32 v7, -v7, v9, v8 -; GFX6-NEXT: v_div_fmas_f32 v6, v7, v6, v9 -; GFX6-NEXT: v_div_fixup_f32 v2, v6, v2, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v5, v6, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 @@ -871,69 +879,75 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x ; GFX6-LABEL: v_repeat_divisor_v3f16_x2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX6-NEXT: v_div_scale_f32 v9, s[4:5], v4, v4, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v10, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_div_scale_f32 v7, s[4:5], v6, v6, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v8, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_fma_f32 v9, -v7, v8, 1.0 +; GFX6-NEXT: v_fma_f32 v8, v9, v8, v8 +; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v6, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v10, v9, v8 +; GFX6-NEXT: v_fma_f32 v11, -v7, v10, v9 +; GFX6-NEXT: v_fma_f32 v10, v11, v8, v10 +; GFX6-NEXT: v_fma_f32 v7, -v7, v10, v9 +; GFX6-NEXT: v_div_fmas_f32 v7, v7, v8, v10 +; GFX6-NEXT: v_div_scale_f32 v8, s[4:5], v4, v4, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v9, v8 +; GFX6-NEXT: v_div_fixup_f32 v6, v7, v6, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX6-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX6-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX6-NEXT: v_div_scale_f32 v11, vcc, 1.0, v4, 1.0 -; GFX6-NEXT: v_mul_f32_e32 v12, v11, v10 -; GFX6-NEXT: v_fma_f32 v13, -v9, v12, v11 -; GFX6-NEXT: v_fma_f32 v12, v13, v10, v12 -; GFX6-NEXT: v_fma_f32 v9, -v9, v12, v11 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v8, v8, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v13, v11 -; GFX6-NEXT: v_div_fmas_f32 v9, v9, v10, v12 -; GFX6-NEXT: v_div_fixup_f32 v4, v9, v4, 1.0 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v8, 1.0 -; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 -; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 -; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 -; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 -; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 -; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 -; GFX6-NEXT: v_div_scale_f32 v11, s[4:5], v5, v5, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v13, v11 -; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_div_fixup_f32 v8, v9, v8, 1.0 -; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v5, 1.0 -; GFX6-NEXT: v_fma_f32 v9, -v11, v13, 1.0 -; GFX6-NEXT: v_fma_f32 v9, v9, v13, v13 -; GFX6-NEXT: v_mul_f32_e32 v12, v10, v9 -; GFX6-NEXT: v_fma_f32 v13, -v11, v12, v10 -; GFX6-NEXT: v_fma_f32 v12, v13, v9, v12 -; GFX6-NEXT: v_fma_f32 v10, -v11, v12, v10 -; GFX6-NEXT: v_div_fmas_f32 v9, v10, v9, v12 -; GFX6-NEXT: v_div_fixup_f32 v5, v9, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v10, -v8, v9, 1.0 +; GFX6-NEXT: v_fma_f32 v9, v10, v9, v9 +; GFX6-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0 +; GFX6-NEXT: v_mul_f32_e32 v11, v10, v9 +; GFX6-NEXT: v_fma_f32 v12, -v8, v11, v10 +; GFX6-NEXT: v_fma_f32 v11, v12, v9, v11 +; GFX6-NEXT: v_fma_f32 v8, -v8, v11, v10 +; GFX6-NEXT: v_div_scale_f32 v10, s[4:5], v5, v5, 1.0 +; GFX6-NEXT: v_rcp_f32_e32 v12, v10 +; GFX6-NEXT: v_div_fmas_f32 v8, v8, v9, v11 +; GFX6-NEXT: v_div_fixup_f32 v4, v8, v4, 1.0 +; GFX6-NEXT: v_div_scale_f32 v9, vcc, 1.0, v5, 1.0 +; GFX6-NEXT: v_fma_f32 v8, -v10, v12, 1.0 +; GFX6-NEXT: v_fma_f32 v8, v8, v12, v12 +; GFX6-NEXT: v_mul_f32_e32 v11, v9, v8 +; GFX6-NEXT: v_fma_f32 v12, -v10, v11, v9 +; GFX6-NEXT: v_fma_f32 v11, v12, v8, v11 +; GFX6-NEXT: v_fma_f32 v9, -v10, v11, v9 +; GFX6-NEXT: v_div_fmas_f32 v8, v9, v8, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_div_fixup_f32 v5, v8, v5, 1.0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX6-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v7, v7, v6 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX6-NEXT: v_mul_f32_e32 v7, v7, v8 -; GFX6-NEXT: v_mul_f32_e32 v4, v6, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v6, v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_repeat_divisor_v3f16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 99d494d4feaf4..8920bfbd3b9dc 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -821,46 +821,46 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; SDAG_GFX6-LABEL: v_roundeven_v4f16: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG_GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v3, v3 -; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG_GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG_GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_v4f16: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SDAG_GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v2, v2 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v3, v3 -; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SDAG_GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG_GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; SDAG_GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 6d4b1c4621054..b2317cd653842 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -13,12 +13,11 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -77,15 +76,15 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v4 -; CI-NEXT: v_add_f32_e64 v1, |v1|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -152,14 +151,13 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e64 v1, |v1| ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: @@ -225,15 +223,15 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 -; CI-NEXT: v_add_f32_e64 v1, |v2|, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v1, v2, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -300,12 +298,12 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -368,11 +366,13 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -437,9 +437,12 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fabs_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -504,9 +507,12 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_posk_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; CI-NEXT: v_mov_b32_e32 v3, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -570,11 +576,13 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -638,12 +646,13 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negliteralk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffe400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -707,11 +716,12 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -770,11 +780,12 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -833,12 +844,11 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -897,15 +907,15 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v0, v2, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -972,15 +982,13 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-NEXT: v_sub_f32_e32 v0, v2, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: @@ -1046,15 +1054,15 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 -; CI-NEXT: v_sub_f32_e32 v1, v4, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v1, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,13 +1129,12 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,11 +1197,12 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1253,12 +1261,12 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_inv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffb118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1317,12 +1325,12 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_neginv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; CI-NEXT: v_mov_b32_e32 v3, 0x3118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1381,9 +1389,12 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1447,11 +1458,12 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negliteralk_negliteralk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mov_b32_e32 v2, 0xc5800000 -; CI-NEXT: v_mov_b32_e32 v3, 0xc5000000 +; CI-NEXT: v_mov_b32_e32 v2, 0xec00 +; CI-NEXT: v_mov_b32_e32 v3, 0xe800 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1515,9 +1527,12 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fneg_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v3, 0xc000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -1582,11 +1597,12 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1645,11 +1661,12 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1708,11 +1725,12 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1771,13 +1789,13 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1847,13 +1865,13 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, -|v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_or_b32_e32 v2, 0x8000, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1923,13 +1941,13 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1998,13 +2016,13 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2073,12 +2091,12 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2143,12 +2161,12 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2213,12 +2231,13 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0x4400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,12 +2302,13 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_posk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0x4400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2353,12 +2373,13 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffc400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2423,12 +2444,13 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; CI-NEXT: v_mov_b32_e32 v3, 0xffffc400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2498,10 +2520,12 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, 4.0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_f16: @@ -2538,10 +2562,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, -4.0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_add_f16: @@ -2582,10 +2607,12 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16: @@ -2622,10 +2649,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16: @@ -2666,10 +2694,11 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_mul_f16: @@ -2728,11 +2757,57 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-SAFE-NEXT: s_movk_i32 s4, 0x3f1 +; CI-SAFE-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; CI-SAFE-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; CI-SAFE-NEXT: v_fma_f64 v[1:2], v[3:4], 4.0, v[1:2] +; CI-SAFE-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-SAFE-NEXT: v_and_b32_e32 v3, 0xffe, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CI-SAFE-NEXT: v_bfe_u32 v4, v2, 20, 11 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-SAFE-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; CI-SAFE-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; CI-SAFE-NEXT: v_med3_i32 v5, v5, 0, 13 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; CI-SAFE-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; CI-SAFE-NEXT: s_movk_i32 s4, 0xfc10 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CI-SAFE-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; CI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; CI-SAFE-NEXT: v_or_b32_e32 v3, v6, v3 +; CI-SAFE-NEXT: v_or_b32_e32 v5, v1, v5 +; CI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-SAFE-NEXT: v_and_b32_e32 v5, 7, v3 +; CI-SAFE-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; CI-SAFE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-SAFE-NEXT: v_or_b32_e32 v5, v5, v6 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; CI-SAFE-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CI-SAFE-NEXT: v_mov_b32_e32 v5, 0x7c00 +; CI-SAFE-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-SAFE-NEXT: v_mov_b32_e32 v6, 0x7e00 +; CI-SAFE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-SAFE-NEXT: s_movk_i32 s4, 0x40f +; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-SAFE-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; CI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_fma_f32 v1, v1, 4.0, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fma_f16: @@ -2768,12 +2843,57 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NSZ-NEXT: s_movk_i32 s4, 0x3f1 +; CI-NSZ-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; CI-NSZ-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; CI-NSZ-NEXT: v_fma_f64 v[1:2], v[3:4], -4.0, v[1:2] +; CI-NSZ-NEXT: v_and_b32_e32 v3, 0x1ff, v2 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NSZ-NEXT: v_and_b32_e32 v3, 0xffe, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CI-NSZ-NEXT: v_bfe_u32 v4, v2, 20, 11 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v3, v1 +; CI-NSZ-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 +; CI-NSZ-NEXT: v_or_b32_e32 v3, 0x1000, v1 +; CI-NSZ-NEXT: v_med3_i32 v5, v5, 0, 13 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v6, v5, v3 +; CI-NSZ-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 +; CI-NSZ-NEXT: s_movk_i32 s4, 0xfc10 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CI-NSZ-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; CI-NSZ-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; CI-NSZ-NEXT: v_or_b32_e32 v3, v6, v3 +; CI-NSZ-NEXT: v_or_b32_e32 v5, v1, v5 +; CI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NSZ-NEXT: v_and_b32_e32 v5, 7, v3 +; CI-NSZ-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; CI-NSZ-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-NSZ-NEXT: v_or_b32_e32 v5, v5, v6 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; CI-NSZ-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CI-NSZ-NEXT: v_mov_b32_e32 v5, 0x7c00 +; CI-NSZ-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CI-NSZ-NEXT: v_mov_b32_e32 v6, 0x7e00 +; CI-NSZ-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CI-NSZ-NEXT: s_movk_i32 s4, 0x40f +; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; CI-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CI-NSZ-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; CI-NSZ-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_fma_f32 v1, v1, -4.0, -v2 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_f16: @@ -2817,9 +2937,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-SAFE-NEXT: v_add_f32_e32 v1, v1, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; CI-SAFE-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_f16: @@ -2859,9 +2983,12 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v1 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v2 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index b0e920478e3a5..c026a42993d48 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -9,22 +9,21 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -101,26 +100,28 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| -; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v9, v2, 16, 15 +; CI-NEXT: v_bfe_u32 v3, v3, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: v_add_f32_e32 v3, v8, v6 +; CI-NEXT: v_add_f32_e32 v3, v5, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, v2, v4 @@ -213,26 +214,25 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_bfe_u32 v7, v2, 16, 15 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_mov_b32_e32 v1, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: @@ -310,26 +310,28 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v8, |v8| -; CI-NEXT: v_cvt_f32_f16_e64 v9, |v9| +; CI-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v2 +; CI-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_bfe_u32 v9, v3, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; CI-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: v_add_f32_e32 v2, v9, v6 +; CI-NEXT: v_add_f32_e32 v2, v4, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v3, v3, v5 @@ -422,22 +424,20 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_var_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -508,17 +508,19 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -593,13 +595,17 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fabs_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 @@ -672,13 +678,17 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_posk_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; CI-NEXT: v_mov_b32_e32 v5, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -750,17 +760,19 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_negk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -834,18 +846,19 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; CI-LABEL: add_select_negliteralk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffe400 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0xc4800000 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -919,17 +932,19 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fabs_posk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1003,17 +1018,19 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_posk_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_bfe_u32 v2, v2, 16, 15 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v5, |v5| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1087,22 +1104,19 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI-LABEL: add_select_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1169,26 +1183,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v7 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v3, v6, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_sub_f32_e32 v2, v5, v2 @@ -1269,20 +1283,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1358,26 +1369,26 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v7, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v8 ; CI-NEXT: v_sub_f32_e32 v0, v4, v0 -; CI-NEXT: v_sub_f32_e32 v2, v6, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v2, v4, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_sub_f32_e32 v3, v5, v3 @@ -1459,22 +1470,19 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1546,16 +1554,17 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1625,17 +1634,17 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0xbe230000 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffb118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1705,17 +1714,17 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mov_b32_e32 v6, 0x3e230000 +; CI-NEXT: v_mov_b32_e32 v6, 0x3118 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1784,13 +1793,17 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: add_select_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -1862,15 +1875,17 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; CI-LABEL: add_select_negliteralk_negliteralk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v4, 0xc5800000 -; CI-NEXT: v_mov_b32_e32 v5, 0xc5000000 +; CI-NEXT: v_mov_b32_e32 v4, 0xec00 +; CI-NEXT: v_mov_b32_e32 v5, 0xe800 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v1, v1, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v2 @@ -1942,13 +1957,17 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; CI-LABEL: add_select_fneg_negk_negk_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, 0xbc00 +; CI-NEXT: v_mov_b32_e32 v5, 0xc000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc ; CI-NEXT: v_sub_f32_e32 v1, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 @@ -2022,16 +2041,17 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x3c00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2101,16 +2121,17 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2180,16 +2201,17 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffbc00 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -1.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2258,23 +2280,21 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2352,23 +2372,21 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_fabs_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2446,23 +2464,21 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_neg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2539,23 +2555,21 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; CI-LABEL: add_select_fabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_add_f32_e32 v1, v1, v5 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2632,22 +2646,20 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_neg_negfabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v3, |v3| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2720,22 +2732,20 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI-LABEL: add_select_negfabs_neg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v7, |v7| -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CI-NEXT: v_sub_f32_e32 v1, v5, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; CI-NEXT: v_sub_f32_e32 v1, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_sub_f32_e32 v0, v4, v0 +; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2809,17 +2819,18 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x4400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2895,17 +2906,18 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0x4400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -2981,17 +2993,18 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffc400 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3067,17 +3080,18 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v6, 0xffffc400 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v5, vcc +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, -4.0, v2, vcc -; CI-NEXT: v_mul_f32_e32 v1, v1, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mul_f32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3159,7 +3173,8 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 @@ -3168,13 +3183,10 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3248,16 +3260,17 @@ define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3327,7 +3340,8 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_add_f32_e32 v3, -4.0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v2, -4.0, v2 @@ -3336,13 +3350,10 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3416,16 +3427,17 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16_nsz(<2 x i32> %c, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 -; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3493,16 +3505,17 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3569,28 +3582,110 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; CI-LABEL: select_fneg_posk_src_fma_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; CI-NEXT: s_movk_i32 s4, 0x3f1 +; CI-NEXT: s_movk_i32 s5, 0xfc10 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_fma_f32 v4, v5, 4.0, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v2, v2, 4.0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_fma_f64 v[4:5], v[6:7], 4.0, v[4:5] +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_and_b32_e32 v6, 0x1ff, v5 +; CI-NEXT: v_or_b32_e32 v4, v6, v4 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v6, 8, v5 +; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CI-NEXT: v_and_b32_e32 v6, 0xffe, v6 +; CI-NEXT: v_bfe_u32 v7, v5, 20, 11 +; CI-NEXT: v_or_b32_e32 v4, v6, v4 +; CI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 +; CI-NEXT: v_or_b32_e32 v6, 0x1000, v4 +; CI-NEXT: v_med3_i32 v8, v8, 0, 13 +; CI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 +; CI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 +; CI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 +; CI-NEXT: v_or_b32_e32 v6, v9, v6 +; CI-NEXT: v_or_b32_e32 v8, v4, v8 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 +; CI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CI-NEXT: v_and_b32_e32 v8, 7, v6 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; CI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 +; CI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CI-NEXT: v_or_b32_e32 v8, v8, v9 +; CI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 +; CI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CI-NEXT: v_mov_b32_e32 v8, 0x7c00 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 +; CI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CI-NEXT: v_mov_b32_e32 v9, 0x7e00 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CI-NEXT: s_movk_i32 s6, 0x40f +; CI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; CI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v10 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 +; CI-NEXT: v_fma_f64 v[2:3], v[6:7], 4.0, v[2:3] +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_and_b32_e32 v5, 0x1ff, v3 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CI-NEXT: v_and_b32_e32 v5, 0xffe, v5 +; CI-NEXT: v_bfe_u32 v6, v3, 20, 11 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 +; CI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; CI-NEXT: v_or_b32_e32 v5, 0x1000, v2 +; CI-NEXT: v_med3_i32 v7, v7, 0, 13 +; CI-NEXT: v_lshrrev_b32_e32 v10, v7, v5 +; CI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 +; CI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 +; CI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; CI-NEXT: v_or_b32_e32 v5, v10, v5 +; CI-NEXT: v_or_b32_e32 v7, v2, v7 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 +; CI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; CI-NEXT: v_and_b32_e32 v7, 7, v5 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; CI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; CI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CI-NEXT: v_or_b32_e32 v7, v7, v10 +; CI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 +; CI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CI-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v4, v2 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3665,30 +3760,32 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; CI-LABEL: select_fneg_posk_src_fmad_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mul_f32_e32 v5, 4.0, v5 -; CI-NEXT: v_add_f32_e32 v4, v5, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v4, 4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v4, v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_add_f32_e32 v2, v2, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] @@ -3763,22 +3860,27 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> ; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5 -; CI-NEXT: v_sub_f32_e32 v4, v5, v4 ; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_sub_f32_e32 v2, v2, v3 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v4, vcc +; CI-NEXT: v_mul_f32_e32 v4, -4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_sub_f32_e32 v2, v2, v3 +; CI-NEXT: v_sub_f32_e32 v4, v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; CI-NEXT: v_mov_b32_e32 v4, 0x4000 +; CI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index c7422a25f71e7..59c0f1cc7782f 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -521,10 +521,10 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_safe: @@ -566,10 +566,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: @@ -611,10 +611,10 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: @@ -696,10 +696,10 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe: @@ -741,10 +741,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: @@ -786,10 +786,10 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: @@ -873,16 +873,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe: @@ -944,16 +945,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag: @@ -1015,16 +1017,17 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag: @@ -1084,13 +1087,13 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1124,16 +1127,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe: @@ -1195,16 +1199,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag: @@ -1266,16 +1271,17 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag: @@ -1335,13 +1341,13 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 ; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1373,30 +1379,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe: @@ -1475,30 +1483,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag: @@ -1577,30 +1587,32 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag: @@ -1679,29 +1691,29 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_min_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1731,30 +1743,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> % ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe: @@ -1833,30 +1847,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag: @@ -1935,30 +1951,32 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v0 +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v11, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v15, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v13, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag: @@ -2037,29 +2055,29 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 ; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX7-NEXT: v_max_f32_e32 v6, v7, v6 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll index ba04cdb795ce3..563e95f7f55b5 100644 --- a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll @@ -15,14 +15,12 @@ define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ; %bb.1: ; %bb -; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4 -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s4, s6 -; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2 -; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4 -; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; CHECK-NEXT: s_mov_b32 s11, 0xf000 +; CHECK-NEXT: s_mov_b32 s10, 0 +; CHECK-NEXT: s_mov_b32 s8, s10 +; CHECK-NEXT: s_mov_b32 s9, s10 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: buffer_store_short v2, v[0:1], s[8:11], 0 addr64 offset:2 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -48,17 +46,13 @@ define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) # ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0 ; CHECK-NEXT: ; %bb.1: ; %bb -; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0 ; CHECK-NEXT: s_mov_b32 s7, 0xf000 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2 -; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4 -; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 52cb3935b9a01..195d222408139 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -10,39 +10,36 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s12 ; SI-NEXT: s_mov_b32 s21, s13 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -185,6 +182,8 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 @@ -193,8 +192,6 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc @@ -202,11 +199,8 @@ define amdgpu_kernel void @select_f16_imm_a( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -329,6 +323,8 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 @@ -337,8 +333,6 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc @@ -346,11 +340,8 @@ define amdgpu_kernel void @select_f16_imm_b( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -477,24 +468,23 @@ define amdgpu_kernel void @select_f16_imm_c( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -620,24 +610,23 @@ define amdgpu_kernel void @select_f16_imm_d( ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -752,54 +741,47 @@ define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 -; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s20, s12 ; SI-NEXT: s_mov_b32 s21, s13 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -960,11 +942,11 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 @@ -975,25 +957,17 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1135,11 +1109,11 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 @@ -1150,25 +1124,17 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc -; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc +; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1315,35 +1281,33 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -1489,34 +1453,32 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 +; SI-NEXT: v_mov_b32_e32 v3, 0x3800 +; SI-NEXT: v_mov_b32_e32 v4, 0x3900 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -1679,34 +1641,17 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; SI-LABEL: v_vselect_v4f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v4f16: @@ -1807,62 +1752,27 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-LABEL: v_vselect_v8f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; SI-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5] -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; SI-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; SI-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; SI-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v15 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v8f16: @@ -2010,120 +1920,49 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cndmask_b32_e32 v29, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cndmask_b32_e32 v27, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cndmask_b32_e32 v25, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cndmask_b32_e32 v23, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cndmask_b32_e32 v21, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cndmask_b32_e32 v19, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cndmask_b32_e32 v17, v32, v31, vcc -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 -; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v29 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cndmask_b32_e64 v26, v14, v6, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[4:5] +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v27 +; SI-NEXT: v_cndmask_b32_e64 v14, v13, v5, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; SI-NEXT: v_cndmask_b32_e32 v25, v12, v4, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; SI-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; SI-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v2, v2, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v4, v4, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SI-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v9 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v10 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v11 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v12 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v25 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v14 +; SI-NEXT: v_bfi_b32 v6, s4, v6, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; SI-NEXT: v_bfi_b32 v7, s4, v8, v7 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v16f16: @@ -2388,324 +2227,146 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; SI-LABEL: v_vselect_v32f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 -; SI-NEXT: v_cndmask_b32_e32 v33, v42, v41, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 -; SI-NEXT: v_cndmask_b32_e32 v35, v45, v43, vcc -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 -; SI-NEXT: v_cndmask_b32_e32 v36, v47, v46, vcc -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v59 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cndmask_b32_e32 v37, v57, v56, vcc -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 -; SI-NEXT: v_cndmask_b32_e32 v38, v34, v58, vcc +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 -; SI-NEXT: v_cndmask_b32_e32 v34, v41, v40, vcc -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v37 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cndmask_b32_e64 v37, v38, v15, s[14:15] +; SI-NEXT: v_cndmask_b32_e64 v15, v38, v15, s[12:13] +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cndmask_b32_e32 v39, v40, v39, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v39 +; SI-NEXT: v_cndmask_b32_e64 v39, v30, v14, s[12:13] +; SI-NEXT: v_cndmask_b32_e64 v14, v30, v14, s[10:11] +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v48 +; SI-NEXT: v_cndmask_b32_e64 v48, v29, v13, s[10:11] ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; SI-NEXT: v_cndmask_b32_e32 v50, v42, v41, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v49 +; SI-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[10:11] ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 -; SI-NEXT: v_cndmask_b32_e32 v51, v40, v55, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v50 +; SI-NEXT: v_cndmask_b32_e64 v50, v28, v12, s[10:11] +; SI-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v51 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cndmask_b32_e64 v28, v27, v11, s[8:9] ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_cndmask_b32_e32 v52, v42, v41, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v32 +; SI-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[8:9] ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 -; SI-NEXT: v_cndmask_b32_e32 v53, v42, v41, vcc -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 -; SI-NEXT: v_cndmask_b32_e32 v54, v47, v43, vcc -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v43 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 -; SI-NEXT: v_cndmask_b32_e32 v49, v56, v47, vcc -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 -; SI-NEXT: v_cndmask_b32_e32 v48, v58, v57, vcc -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v33 +; SI-NEXT: v_cndmask_b32_e64 v27, v26, v10, s[8:9] +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 +; SI-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[8:9] +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 -; SI-NEXT: v_cndmask_b32_e32 v44, v58, v56, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v32 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v34 +; SI-NEXT: v_cndmask_b32_e64 v34, v25, v9, s[8:9] +; SI-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[6:7] +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cndmask_b32_e32 v15, v58, v15, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v58, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; SI-NEXT: v_cndmask_b32_e32 v14, v58, v14, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 -; SI-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v35 +; SI-NEXT: v_cndmask_b32_e64 v35, v24, v8, s[6:7] +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v36 +; SI-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[6:7] +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 -; SI-NEXT: v_cndmask_b32_e32 v13, v28, v58, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v28, v10 +; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v38 +; SI-NEXT: v_cndmask_b32_e64 v38, v23, v7, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 -; SI-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 -; SI-NEXT: v_cndmask_b32_e32 v11, v26, v28, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 -; SI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 -; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 -; SI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:8 +; SI-NEXT: v_cndmask_b32_e64 v23, v22, v6, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v60 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:4 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v6, s4, v6, v23 +; SI-NEXT: v_bfi_b32 v7, s4, v7, v38 +; SI-NEXT: v_bfi_b32 v8, s4, v8, v35 +; SI-NEXT: v_bfi_b32 v9, s4, v9, v34 +; SI-NEXT: v_bfi_b32 v10, s4, v10, v27 +; SI-NEXT: v_bfi_b32 v11, s4, v11, v28 +; SI-NEXT: v_bfi_b32 v12, s4, v12, v50 +; SI-NEXT: v_bfi_b32 v13, s4, v13, v48 +; SI-NEXT: v_bfi_b32 v14, s4, v14, v39 +; SI-NEXT: v_bfi_b32 v15, s4, v15, v37 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; SI-NEXT: v_cndmask_b32_e32 v26, v21, v5, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 ; SI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 -; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v29 +; SI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 ; SI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 ; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v46 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 ; SI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v16, v44 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v5, v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v7, v7, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v9, v9, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v35 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v33 -; SI-NEXT: v_or_b32_e32 v11, v18, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v31 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v34 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_or_b32_e32 v13, v18, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfi_b32 v0, s4, v0, v17 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v18 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v19 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v20 +; SI-NEXT: v_bfi_b32 v4, s4, v4, v21 +; SI-NEXT: v_bfi_b32 v5, s4, v5, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 9a52b96bde709..80bf0b1336b01 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -10,6 +10,7 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44,9 +45,10 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: @@ -88,11 +90,12 @@ define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half ; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: @@ -197,6 +200,7 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -236,8 +240,9 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal ; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 @@ -287,10 +292,12 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal ; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 @@ -348,7 +355,9 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: @@ -387,8 +396,9 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: @@ -507,11 +517,12 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %p ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index 31c64046de11a..87b41815e36d5 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -11,9 +11,6 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: @@ -49,13 +46,8 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -107,19 +99,12 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo ; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -226,9 +211,7 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: @@ -268,9 +251,6 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) # ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: @@ -329,8 +309,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: @@ -368,15 +346,10 @@ define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x flo ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -438,8 +411,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: @@ -475,8 +446,6 @@ define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 9fe064c717972..ef2a06935f20a 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -17,6 +17,7 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; GFX7-LABEL: f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -30,13 +31,14 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v2f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v1 -; GFX7-NEXT: flat_store_dword v[1:2], v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v1 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") @@ -48,17 +50,19 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v3f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v6 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[2:3], v4 ; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") @@ -70,22 +74,24 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; GFX7-LABEL: v4f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: flat_store_dword v[0:1], v4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: flat_store_dword v[0:1], v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") @@ -98,9 +104,6 @@ define half @f16_return(float %arg) #0 { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %fptrunc @@ -112,13 +115,8 @@ define <2 x half> @v2f16_return(<2 x float> %arg) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -129,19 +127,12 @@ define <3 x half> @v3f16_return(<3 x float> %arg) #0 { ; GFX7-LABEL: v3f16_return: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -153,25 +144,15 @@ define <4 x half> @v4f16_return(<4 x float> %arg) #0 { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %fptrunc @@ -421,15 +402,10 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index ca93fcf3f55a2..c3a7e2ae4f344 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -2175,11 +2175,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index f5dc824aae35f..d987e7c65e692 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -252,8 +252,16 @@ bb: ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]] ; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]] -; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -; SI: v_madmk_f32 v{{[0-9]+}}, v{{[0-9]+}}, 0x41000000, v{{[0-9]+}} +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index 8da6f2348690a..34cf771fae45e 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,16 +1,19 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s +; FIXME: Can the SI case form the mac through the casts? + ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] + +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 + ; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm @@ -32,8 +35,14 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_same_add: -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} @@ -67,8 +76,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32 + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -94,8 +107,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -121,8 +138,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_c: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -146,8 +167,16 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}} + +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm @@ -170,8 +199,15 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] + +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm @@ -194,8 +230,14 @@ entry: } ; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -v{{[0-9]}} +; SI: v_add_f32_e32 +; SI: v_cvt_f16_f32 + ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -220,8 +262,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -247,8 +293,11 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -274,8 +323,12 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math: ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_mul_f32_e32 +; SI: v_cvt_f16_f32 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] +; SI: v_cvt_f32_f16 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} @@ -299,30 +352,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16: -; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_add_f32 + +; VI: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] +; VI: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] +; VI: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] @@ -330,8 +369,8 @@ entry: ; VI-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] -; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] -; GCN: s_endpgm +; VI: {{buffer|flat}}_store_dword v[[R_V2_F16]] +; VI: s_endpgm define amdgpu_kernel void @mac_v2f16( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -352,10 +391,14 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_same_add: -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f16_f32 +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_add_f32 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -390,11 +433,12 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} - -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -419,10 +463,15 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32_e32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32_e32 ; VI-NOT: v_mac_f16 @@ -448,15 +497,15 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f32_f16 -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -481,11 +530,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f32_f16 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_cvt_f16_f32 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -513,11 +567,14 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_add_f32 +; SI: v_add_f32 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -545,11 +602,12 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_add_f32 +; SI: v_add_f32 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} @@ -577,15 +635,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -610,15 +669,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -643,15 +703,16 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_mul_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 +; SI: v_sub_f32 +; SI: v_cvt_f16_f32 ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 3afe55fc93423..b675e0ffe9eed 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -11,22 +11,25 @@ define amdgpu_kernel void @madak_f16( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x41200000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -140,7 +143,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_mov_b32 s4, s10 @@ -148,12 +150,18 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000 -; SI-NEXT: v_mac_f32_e32 v3, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x41200000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x41200000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index deb140fa7e941..0ec4c18a070fe 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -20,12 +20,14 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -158,14 +160,18 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -311,17 +317,23 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -499,27 +511,41 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -787,47 +813,77 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll index 4c212daab39ee..44b8b8bcb9ae8 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll @@ -182,30 +182,19 @@ entry: } define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v3half: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v3half: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_fmax_v3half: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_fmax_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -390,14 +379,19 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -618,22 +612,37 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v6 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v5 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -966,38 +975,73 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v12 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v4, v11 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v5, v10 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v9 -; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3584,6 +3628,5 @@ declare double @llvm.vector.reduce.fmax.v16double(<16 x double>) ; GFX10: {{.*}} ; GFX11: {{.*}} ; GFX12: {{.*}} -; GFX7: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll index d198bb45654da..a20b5de786271 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll @@ -102,14 +102,16 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -217,21 +219,25 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -353,40 +359,52 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -580,76 +598,104 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fmaximum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v9, v0, v8 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_max_f32_e32 v16, v0, v15 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v15, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_max_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v1, v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v9, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll index 479dc08a4f7aa..ed5c910def3d6 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll @@ -182,30 +182,19 @@ entry: } define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { -; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v3half: -; GFX7-SDAG: ; %bb.0: ; %entry -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v3half: -; GFX7-GISEL: ; %bb.0: ; %entry -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: test_vector_reduce_fmin_v3half: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_fmin_v3half: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -390,14 +379,19 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -618,22 +612,37 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v6 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v5 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -966,38 +975,73 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v13 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v12 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v4, v11 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v5, v10 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v9 -; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3583,6 +3627,5 @@ declare double @llvm.vector.reduce.fmin.v16double(<16 x double>) ; GFX10: {{.*}} ; GFX11: {{.*}} ; GFX12: {{.*}} -; GFX7: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll index 506d847c1144b..63e42e1e8a320 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll @@ -126,14 +126,16 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -266,21 +268,25 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -431,40 +437,52 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v8half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v5, v0, v4 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_min_f32_e32 v8, v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -700,76 +718,104 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX7-LABEL: test_vector_reduce_fminimum_v16half: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v9, v0, v8 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_min_f32_e32 v16, v0, v15 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_min_f32_e32 v15, v0, v1 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v14 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v13 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v12 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v11 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v10 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_min_f32_e32 v1, v0, v9 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v1, v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v9, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v10, v2, vcc ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index 7ea92e7b3582c..57dc288bf6dcd 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -20,12 +20,14 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -158,14 +160,18 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -311,17 +317,23 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -499,27 +511,41 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -787,47 +813,77 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v15 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v14 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v13 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v12 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v11 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ;